(unknown)

linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* (unknown)
@ 2005-07-25 22:44 Ram Pai
  2005-07-25 22:44 ` (unknown) Ram Pai
                   ` (9 more replies)
  0 siblings, 10 replies; 37+ messages in thread
From: Ram Pai @ 2005-07-25 22:44 UTC (permalink / raw)
  To: akpm, Al Viro; +Cc: Avantika Mathur, Mike Waychison

, miklos@szeredi.hu, Janak Desai <janak@us.ibm.com>, linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org
Subject: [PATCH 0/7] shared subtree

Hi Andrew/Al Viro,

	Enclosing a final set of well tested patches that implement
	Al Viro's shared subtree proposal.

	These patches provide the ability to mark a mount tree as
	shared/private/slave/unclone, along with the ability to play with these
	trees with operations like bind/rbind/move/pivot_root/namespace-clone
	etc.

	I believe this powerful feature can help build features like
	per-user namespace.  Couple of projects may benefit from
	shared subtrees.
	1) automounter for the ability to automount across namespaces.
	2) SeLinux for implementing polyinstantiated trees.
	3) MVFS for providing versioning file system.
	4) FUSE for per-user namespaces?
	
	Thanks to Avantika for developing about 100+ test cases that tests
	various combintation of private/shared/slave/unclonable trees. All
	these tests have passed. I feel pretty confident about the stability of
	the code.
	
	The patches have been broken into 7 units, for ease of review.  I
	realize that patch-3 'rbind.patch' is a bit heavier than all the other
	patches. The reason being, most of the shared-subtree functionality 
	gets manifestated during bind/rbind operation.

	Couple of work items to be done are:
	1. modify the mount command to support this feature
		eg:  mount --make-shared /tmp
	2. a tool that can help visualize the propogation tree, maybe
		support in /proc?
	3. some documentation on how to use all this functionality.

	Please consider the patches for inclusion in your tree.

	The footprint of this code is pretty small in the normal code path
	where shared-subtree functionality is not used.

	Any suggestions/comments to improve the code is welcome.

Thanks,
RP

^ permalink raw reply	[flat|nested] 37+ messages in thread

* (unknown)
  2005-07-25 22:44 (unknown) Ram Pai
@ 2005-07-25 22:44 ` Ram Pai
  2005-07-25 22:44 ` (unknown) Ram Pai
                   ` (8 subsequent siblings)
  9 siblings, 0 replies; 37+ messages in thread
From: Ram Pai @ 2005-07-25 22:44 UTC (permalink / raw)
  To: akpm, Al Viro; +Cc: Avantika Mathur, Mike Waychison

, miklos@szeredi.hu, Janak Desai <janak@us.ibm.com>, linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org
Subject: [PATCH 1/7] shared subtree
Content-Type: text/x-patch; name=shared_private_slave.patch
Content-Disposition: inline; filename=shared_private_slave.patch

This patch adds the shared/private/slave support for VFS trees.

Signed by Ram Pai (linuxram@us.ibm.com)

 fs/Makefile           |    2 
 fs/dcache.c           |    2 
 fs/namespace.c        |   93 ++++++++++
 fs/pnode.c            |  441 ++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/fs.h    |    5 
 include/linux/mount.h |   44 ++++
 include/linux/pnode.h |   90 ++++++++++
 7 files changed, 673 insertions(+), 4 deletions(-)

Index: 2.6.12.work2/fs/namespace.c
===================================================================
--- 2.6.12.work2.orig/fs/namespace.c
+++ 2.6.12.work2/fs/namespace.c
@@ -22,6 +22,7 @@
 #include <linux/namei.h>
 #include <linux/security.h>
 #include <linux/mount.h>
+#include <linux/pnode.h>
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
 
@@ -62,6 +63,7 @@ struct vfsmount *alloc_vfsmnt(const char
 		INIT_LIST_HEAD(&mnt->mnt_mounts);
 		INIT_LIST_HEAD(&mnt->mnt_list);
 		INIT_LIST_HEAD(&mnt->mnt_fslink);
+		INIT_LIST_HEAD(&mnt->mnt_pnode_mntlist);
 		if (name) {
 			int size = strlen(name)+1;
 			char *newname = kmalloc(size, GFP_KERNEL);
@@ -615,6 +617,95 @@ out_unlock:
 	return err;
 }
 
+static int do_make_shared(struct vfsmount *mnt)
+{
+	int err=0;
+	struct vfspnode *old_pnode = NULL;
+	/*
+	 * if the mount is already a slave mount,
+	 * allocate a new pnode and make it
+	 * a slave pnode of the original pnode.
+	 */
+	if (IS_MNT_SLAVE(mnt)) {
+		old_pnode = mnt->mnt_pnode;
+		pnode_del_slave_mnt(mnt);
+	}
+	if(!IS_MNT_SHARED(mnt)) {
+		mnt->mnt_pnode = pnode_alloc();
+		if(!mnt->mnt_pnode) {
+			pnode_add_slave_mnt(old_pnode, mnt);
+			err = -ENOMEM;
+			goto out;
+		}
+		pnode_add_member_mnt(mnt->mnt_pnode, mnt);
+	}
+	if(old_pnode)
+		pnode_add_slave_pnode(old_pnode, mnt->mnt_pnode);
+	set_mnt_shared(mnt);
+out:
+	return err;
+}
+
+static int do_make_slave(struct vfsmount *mnt)
+{
+	int err=0;
+
+	if (IS_MNT_SLAVE(mnt))
+		goto out;
+	/*
+	 * only shared mounts can
+	 * be made slave
+	 */
+	if (!IS_MNT_SHARED(mnt)) {
+		err = -EINVAL;
+		goto out;
+	}
+	pnode_member_to_slave(mnt);
+out:
+	return err;
+}
+
+static int do_make_private(struct vfsmount *mnt)
+{
+	if(mnt->mnt_pnode)
+		pnode_disassociate_mnt(mnt);
+	set_mnt_private(mnt);
+	return 0;
+}
+
+/*
+ * recursively change the type of the mountpoint.
+ */
+static int do_change_type(struct nameidata *nd, int flag)
+{
+	struct vfsmount *m, *mnt = nd->mnt;
+	int err=0;
+
+	if (!(flag & MS_SHARED) && !(flag & MS_PRIVATE)
+			&& !(flag & MS_SLAVE))
+		return -EINVAL;
+
+	if (nd->dentry != nd->mnt->mnt_root)
+		return -EINVAL;
+
+	spin_lock(&vfsmount_lock);
+	for (m = mnt; m; m = next_mnt(m, mnt)) {
+		switch (flag) {
+		case MS_SHARED:
+			err = do_make_shared(m);
+			break;
+		case MS_SLAVE:
+			err = do_make_slave(m);
+			break;
+		case MS_PRIVATE:
+			err = do_make_private(m);
+			break;
+		}
+	}
+	spin_unlock(&vfsmount_lock);
+	return err;
+}
+
 /*
  * do loopback mount.
  */
@@ -1049,6 +1140,8 @@ long do_mount(char * dev_name, char * di
 				    data_page);
 	else if (flags & MS_BIND)
 		retval = do_loopback(&nd, dev_name, flags & MS_REC);
+	else if (flags & MS_SHARED || flags & MS_PRIVATE || flags & MS_SLAVE)
+		retval = do_change_type(&nd, flags);
 	else if (flags & MS_MOVE)
 		retval = do_move_mount(&nd, dev_name);
 	else
Index: 2.6.12.work2/fs/pnode.c
===================================================================
--- /dev/null
+++ 2.6.12.work2/fs/pnode.c
@@ -0,0 +1,441 @@
+/*
+ *  linux/fs/pnode.c
+ *
+ * (C) Copyright IBM Corporation 2005.
+ *	Released under GPL v2.
+ *	Author : Ram Pai (linuxram@us.ibm.com)
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/syscalls.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/smp_lock.h>
+#include <linux/init.h>
+#include <linux/quotaops.h>
+#include <linux/acct.h>
+#include <linux/module.h>
+#include <linux/seq_file.h>
+#include <linux/namespace.h>
+#include <linux/namei.h>
+#include <linux/security.h>
+#include <linux/mount.h>
+#include <linux/pnode.h>
+#include <asm/uaccess.h>
+#include <asm/unistd.h>
+#include <stdarg.h>
+
+
+static kmem_cache_t * pnode_cachep;
+
+/* spinlock for pnode related operations */
+ __cacheline_aligned_in_smp DEFINE_SPINLOCK(vfspnode_lock);
+
+enum pnode_vfs_type {
+	PNODE_MEMBER_VFS = 0x01,
+	PNODE_SLAVE_VFS = 0x02
+};
+
+void __init pnode_init(unsigned long mempages)
+{
+	pnode_cachep = kmem_cache_create("pnode_cache",
+                       sizeof(struct vfspnode), 0,
+                       SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
+}
+
+struct vfspnode * pnode_alloc(void)
+{
+	struct vfspnode *pnode =  kmem_cache_alloc(pnode_cachep, GFP_KERNEL);
+	INIT_LIST_HEAD(&pnode->pnode_vfs);
+	INIT_LIST_HEAD(&pnode->pnode_slavevfs);
+	INIT_LIST_HEAD(&pnode->pnode_slavepnode);
+	INIT_LIST_HEAD(&pnode->pnode_peer_slave);
+	pnode->pnode_master = NULL;
+	pnode->pnode_flags = 0;
+	atomic_set(&pnode->pnode_count,0);
+	return pnode;
+}
+
+void inline pnode_free(struct vfspnode *pnode)
+{
+	kmem_cache_free(pnode_cachep, pnode);
+}
+
+/*
+ * __put_pnode() should be called with vfspnode_lock held
+ */
+void __put_pnode(struct vfspnode *pnode)
+{
+	struct vfspnode *tmp_pnode;
+	do {
+		tmp_pnode = pnode->pnode_master;
+		list_del_init(&pnode->pnode_peer_slave);
+		BUG_ON(!list_empty(&pnode->pnode_vfs));
+		BUG_ON(!list_empty(&pnode->pnode_slavevfs));
+		BUG_ON(!list_empty(&pnode->pnode_slavepnode));
+		pnode_free(pnode);
+		pnode = tmp_pnode;
+		if (!pnode || !atomic_dec_and_test(&pnode->pnode_count))
+			break;
+	} while(pnode);
+}
+
+static void inline pnode_add_mnt(struct vfspnode *pnode,
+		struct vfsmount *mnt, int slave)
+{
+	if (!pnode || !mnt)
+		return;
+	spin_lock(&vfspnode_lock);
+	mnt->mnt_pnode = pnode;
+	if (slave) {
+		set_mnt_slave(mnt);
+		list_add(&mnt->mnt_pnode_mntlist, &pnode->pnode_slavevfs);
+	} else {
+		set_mnt_shared(mnt);
+		list_add(&mnt->mnt_pnode_mntlist, &pnode->pnode_vfs);
+	}
+	get_pnode(pnode);
+	spin_unlock(&vfspnode_lock);
+}
+
+void pnode_add_member_mnt(struct vfspnode *pnode,
+		struct vfsmount *mnt)
+{
+	pnode_add_mnt(pnode, mnt, 0);
+}
+
+void pnode_add_slave_mnt(struct vfspnode *pnode,
+		struct vfsmount *mnt)
+{
+	pnode_add_mnt(pnode, mnt, 1);
+}
+
+
+void pnode_add_slave_pnode(struct vfspnode *pnode,
+		struct vfspnode *slave_pnode)
+{
+	if (!pnode || !slave_pnode)
+		return;
+	spin_lock(&vfspnode_lock);
+	slave_pnode->pnode_master = pnode;
+	slave_pnode->pnode_flags = 0;
+	list_add(&slave_pnode->pnode_peer_slave, &pnode->pnode_slavepnode);
+	get_pnode(pnode);
+	spin_unlock(&vfspnode_lock);
+}
+
+/*
+ * merge 'pnode' into 'peer_pnode' and get rid of pnode
+ * @pnode: pnode the contents of which have to be merged
+ * @peer_pnode: pnode into which the contents are merged
+ */
+int pnode_merge_pnode(struct vfspnode *pnode, struct vfspnode *peer_pnode)
+{
+	struct vfspnode *slave_pnode, *pnext;
+	struct vfsmount *mnt, *slave_mnt, *next;
+
+	list_for_each_entry_safe(slave_pnode,  pnext,
+			&pnode->pnode_slavepnode, pnode_peer_slave) {
+		slave_pnode->pnode_master = peer_pnode;
+		list_move(&slave_pnode->pnode_peer_slave,
+				&peer_pnode->pnode_slavepnode);
+		put_pnode_locked(pnode);
+		get_pnode(peer_pnode);
+	}
+
+	list_for_each_entry_safe(slave_mnt,  next,
+			&pnode->pnode_slavevfs, mnt_pnode_mntlist) {
+		slave_mnt->mnt_pnode = peer_pnode;
+		list_move(&slave_mnt->mnt_pnode_mntlist,
+				&peer_pnode->pnode_slavevfs);
+		put_pnode_locked(pnode);
+		get_pnode(peer_pnode);
+	}
+
+	list_for_each_entry_safe(mnt, next,
+			&pnode->pnode_vfs, mnt_pnode_mntlist) {
+		mnt->mnt_pnode = peer_pnode;
+		list_move(&mnt->mnt_pnode_mntlist,
+				&peer_pnode->pnode_vfs);
+		put_pnode_locked(pnode);
+		get_pnode(peer_pnode);
+	}
+	return 0;
+}
+
+/*
+ * called when pnode has no member mounts.  Merge all the slave mounts/pnodes
+ * of this pnode with that of its master pnode. If master pnode does not exit,
+ * convert all the slave mounts to private mounts.
+ */
+static void empty_pnode(struct vfspnode *pnode) { struct vfsmount *slave_mnt,
+	*next; struct vfspnode *master_pnode, *slave_pnode, *pnext;
+
+	if ((master_pnode = pnode->pnode_master)) {
+		pnode->pnode_master = NULL;
+		list_del_init(&pnode->pnode_peer_slave);
+		pnode_merge_pnode(pnode, master_pnode);
+		put_pnode_locked(master_pnode);
+	} else {
+		list_for_each_entry_safe(slave_mnt, next,
+			&pnode->pnode_slavevfs, mnt_pnode_mntlist) {
+			list_del_init(&slave_mnt->mnt_pnode_mntlist);
+			set_mnt_private(slave_mnt);
+			put_pnode_locked(pnode);
+		}
+		list_for_each_entry_safe(slave_pnode,  pnext,
+			&pnode->pnode_slavepnode, pnode_peer_slave) {
+			slave_pnode->pnode_master = NULL;
+			list_del_init(&slave_pnode->pnode_peer_slave);
+			put_pnode_locked(pnode);
+		}
+	}
+}
+
+static void __pnode_disassociate_mnt(struct vfsmount *mnt)
+{
+	struct vfspnode *pnode = mnt->mnt_pnode;
+
+	spin_lock(&vfspnode_lock);
+	list_del_init(&mnt->mnt_pnode_mntlist);
+
+	if (list_empty(&pnode->pnode_vfs))
+		empty_pnode(pnode);
+
+	put_pnode_locked(pnode);
+
+	spin_unlock(&vfspnode_lock);
+	mnt->mnt_pnode = NULL;
+}
+
+void pnode_del_slave_mnt(struct vfsmount *mnt)
+{
+	if (!mnt || !mnt->mnt_pnode)
+		return;
+ 	__pnode_disassociate_mnt(mnt);
+	CLEAR_MNT_SLAVE(mnt);
+}
+
+void pnode_del_member_mnt(struct vfsmount *mnt)
+{
+	if (!mnt || !mnt->mnt_pnode)
+		return;
+ 	__pnode_disassociate_mnt(mnt);
+	CLEAR_MNT_SHARED(mnt);
+}
+
+void pnode_member_to_slave(struct vfsmount *mnt)
+{
+	struct vfspnode *pnode = mnt->mnt_pnode;
+	if (!mnt || !pnode)
+		return;
+
+	spin_lock(&vfspnode_lock);
+
+	list_del_init(&mnt->mnt_pnode_mntlist);
+	list_add(&mnt->mnt_pnode_mntlist, &pnode->pnode_slavevfs);
+	set_mnt_slave(mnt);
+
+	if (list_empty(&pnode->pnode_vfs))
+		empty_pnode(pnode);
+
+	spin_unlock(&vfspnode_lock);
+	return;
+}
+
+void pnode_disassociate_mnt(struct vfsmount *mnt)
+{
+	if (!mnt || !mnt->mnt_pnode)
+		return;
+ 	__pnode_disassociate_mnt(mnt);
+	CLEAR_MNT_SHARED(mnt);
+	CLEAR_MNT_SLAVE(mnt);
+}
+
+struct pcontext {
+	struct vfspnode *start;
+	int	level;
+	struct vfspnode *master_pnode;
+	struct vfspnode *pnode;
+};
+
+/*
+ * Walk the pnode tree for each pnode encountered.
+ * @context: provides context on the state of the last walk in the pnode
+ * 		tree.
+ */
+static int pnode_next(struct pcontext *context)
+{
+	struct vfspnode *pnode = context->pnode;
+	struct vfspnode	*master_pnode=context->master_pnode;
+	struct list_head *next;
+
+	if (!pnode) {
+		BUG_ON(!context->start);
+		get_pnode(context->start);
+		context->pnode = context->start;
+		context->master_pnode = NULL;
+		context->level = 0;
+		return 1;
+	}
+
+	spin_lock(&vfspnode_lock);
+	next = pnode->pnode_slavepnode.next;
+	if (next == &pnode->pnode_slavepnode) {
+		while (1) {
+			int flag;
+
+			if (pnode == context->start) {
+				put_pnode_locked(pnode);
+				spin_unlock(&vfspnode_lock);
+				BUG_ON(context->level != 0);
+				return 0;
+			}
+
+			next = pnode->pnode_peer_slave.next;
+			flag = (next != &pnode->pnode_master->pnode_slavepnode);
+			put_pnode_locked(pnode);
+
+			if (flag)
+				break;
+
+			pnode = master_pnode;
+			master_pnode = pnode->pnode_master;
+			context->level--;
+		}
+	} else {
+		master_pnode = pnode;
+		context->level++;
+	}
+
+	pnode = list_entry(next, struct vfspnode, pnode_peer_slave);
+	get_pnode(pnode);
+
+	context->pnode = pnode;
+	context->master_pnode = master_pnode;
+	spin_unlock(&vfspnode_lock);
+	return 1;
+}
+
+/*
+ * skip the rest of the tree, cleaning up
+ * reference to pnodes held in pnode_next().
+ */
+static void pnode_end(struct pcontext *context)
+{
+	struct vfspnode *p = context->pnode;
+	struct vfspnode *start = context->start;
+
+	do {
+		put_pnode(p);
+	} while (p != start && (p = p->pnode_master));
+	return;
+}
+
+/*
+ * traverse the pnode tree and at each pnode encountered, execute the
+ * pnode_fnc(). For each vfsmount encountered call the vfs_fnc().
+ *
+ * @pnode: pnode tree to be traversed
+ * @in_data: input data
+ * @out_data: output data
+ * @pnode_func: function to be called when a new pnode is encountered.
+ * @vfs_func: function to be called on each slave and member vfs belonging
+ * 		to the pnode.
+ */
+static int pnode_traverse(struct vfspnode *pnode,
+		void *in_data,
+		void **out_data,
+		int (*pnode_pre_func)(struct vfspnode *,
+			void *, void **, va_list),
+		int (*pnode_post_func)(struct vfspnode *,
+			void *, va_list),
+		int (*vfs_func)(struct vfsmount *,
+			enum pnode_vfs_type, void *,  va_list),
+		...)
+{
+	va_list args;
+	int ret = 0, level;
+	void *my_data, *data_from_master;
+     	struct vfspnode *master_pnode;
+     	struct vfsmount *slave_mnt, *member_mnt, *t_m;
+	struct pcontext context;
+	static void *p_array[PNODE_MAX_SLAVE_LEVEL];
+
+	context.start = pnode;
+	context.pnode = NULL;
+	/*
+	 * determine whether to process vfs first or the
+	 * slave pnode first
+	 */
+	while (pnode_next(&context)) {
+		level = context.level;
+
+		if (level >= PNODE_MAX_SLAVE_LEVEL)
+			goto error;
+
+		pnode = context.pnode;
+		master_pnode = context.master_pnode;
+
+		if (master_pnode) {
+			data_from_master = p_array[level-1];
+			my_data = NULL;
+		} else {
+			data_from_master = NULL;
+			my_data = in_data;
+		}
+
+		if (pnode_pre_func) {
+			va_start(args, vfs_func);
+			if((ret = pnode_pre_func(pnode,
+				data_from_master, &my_data, args)))
+				goto error;
+			va_end(args);
+		}
+
+		// traverse member vfsmounts
+		spin_lock(&vfspnode_lock);
+		list_for_each_entry_safe(member_mnt,
+			t_m, &pnode->pnode_vfs, mnt_pnode_mntlist) {
+
+			spin_unlock(&vfspnode_lock);
+			va_start(args, vfs_func);
+			if ((ret = vfs_func(member_mnt,
+				PNODE_MEMBER_VFS, my_data, args)))
+				goto error;
+			va_end(args);
+			spin_lock(&vfspnode_lock);
+		}
+		list_for_each_entry_safe(slave_mnt, t_m,
+			&pnode->pnode_slavevfs, mnt_pnode_mntlist) {
+
+			spin_unlock(&vfspnode_lock);
+			va_start(args, vfs_func);
+			if ((ret = vfs_func(slave_mnt, PNODE_SLAVE_VFS,
+				my_data, args)))
+				goto error;
+			va_end(args);
+			spin_lock(&vfspnode_lock);
+		}
+		spin_unlock(&vfspnode_lock);
+
+		if (pnode_post_func) {
+			va_start(args, vfs_func);
+			if((ret = pnode_post_func(pnode,
+				my_data, args)))
+				goto error;
+			va_end(args);
+		}
+
+		p_array[level] = my_data;
+	}
+out:
+	if (out_data)
+		*out_data = p_array[0];
+	return ret;
+error:
+	va_end(args);
+	pnode_end(&context);
+	goto out;
+}
Index: 2.6.12.work2/fs/dcache.c
===================================================================
--- 2.6.12.work2.orig/fs/dcache.c
+++ 2.6.12.work2/fs/dcache.c
@@ -27,6 +27,7 @@
 #include <linux/module.h>
 #include <linux/mount.h>
 #include <linux/file.h>
+#include <linux/pnode.h>
 #include <asm/uaccess.h>
 #include <linux/security.h>
 #include <linux/seqlock.h>
@@ -1737,6 +1738,7 @@ void __init vfs_caches_init(unsigned lon
 	inode_init(mempages);
 	files_init(mempages);
 	mnt_init(mempages);
+	pnode_init(mempages);
 	bdev_cache_init();
 	chrdev_init();
 }
Index: 2.6.12.work2/include/linux/fs.h
===================================================================
--- 2.6.12.work2.orig/include/linux/fs.h
+++ 2.6.12.work2/include/linux/fs.h
@@ -102,6 +102,9 @@ extern int dir_notify_enable;
 #define MS_MOVE		8192
 #define MS_REC		16384
 #define MS_VERBOSE	32768
+#define MS_PRIVATE	(1<<18) /* recursively change to private */
+#define MS_SLAVE	(1<<19) /* recursively change to slave */
+#define MS_SHARED	(1<<20) /* recursively change to shared */
 #define MS_POSIXACL	(1<<16)	/* VFS does not apply the umask */
 #define MS_ACTIVE	(1<<30)
 #define MS_NOUSER	(1<<31)
@@ -232,6 +235,7 @@ extern void update_atime (struct inode *
 extern void __init inode_init(unsigned long);
 extern void __init inode_init_early(void);
 extern void __init mnt_init(unsigned long);
+extern void __init pnode_init(unsigned long);
 extern void __init files_init(unsigned long);
 
 struct buffer_head;
@@ -1211,6 +1215,7 @@ extern struct vfsmount *kern_mount(struc
 extern int may_umount_tree(struct vfsmount *);
 extern int may_umount(struct vfsmount *);
 extern long do_mount(char *, char *, char *, unsigned long, void *);
+extern struct vfsmount *do_make_mounted(struct vfsmount *, struct dentry *);
 
 extern int vfs_statfs(struct super_block *, struct kstatfs *);
 
Index: 2.6.12.work2/include/linux/pnode.h
===================================================================
--- /dev/null
+++ 2.6.12.work2/include/linux/pnode.h
@@ -0,0 +1,90 @@
+/*
+ *  linux/fs/pnode.c
+ *
+ * (C) Copyright IBM Corporation 2005.
+ *	Released under GPL v2.
+ *
+ */
+#ifndef _LINUX_PNODE_H
+#define _LINUX_PNODE_H
+
+#include <linux/list.h>
+#include <linux/mount.h>
+#include <linux/spinlock.h>
+#include <asm/atomic.h>
+
+struct vfspnode {
+	struct list_head pnode_vfs; 	 /* list of vfsmounts anchored here */
+	struct list_head pnode_slavevfs; /* list of slave vfsmounts */
+	struct list_head pnode_slavepnode;/* list of slave pnode */
+	struct list_head pnode_peer_slave;/* going through master's slave pnode
+					    list*/
+	struct vfspnode	 *pnode_master;	  /* master pnode */
+	int 		 pnode_flags;
+	atomic_t 	 pnode_count;
+};
+#define PNODE_MAX_SLAVE_LEVEL 32  /* MAXIMUM DEPTH OF THE PNODE TREE */
+#define PNODE_DELETE  0x01
+#define PNODE_SLAVE   0x02
+
+#define IS_PNODE_DELETE(pn)  ((pn->pnode_flags&PNODE_DELETE)==PNODE_DELETE)
+#define IS_PNODE_SLAVE(pn)  ((pn->pnode_flags&PNODE_SLAVE)==PNODE_SLAVE)
+#define SET_PNODE_DELETE(pn)  pn->pnode_flags |= PNODE_DELETE
+#define SET_PNODE_SLAVE(pn)  pn->pnode_flags |= PNODE_SLAVE
+
+extern spinlock_t vfspnode_lock;
+extern void __put_pnode(struct vfspnode *);
+
+static inline struct vfspnode *
+get_pnode(struct vfspnode *pnode)
+{
+	if (!pnode)
+		return NULL;
+	atomic_inc(&pnode->pnode_count);
+	return pnode;
+}
+
+static inline void
+put_pnode(struct vfspnode *pnode)
+{
+	if (!pnode)
+		return;
+	if (atomic_dec_and_lock(&pnode->pnode_count, &vfspnode_lock)) {
+		__put_pnode(pnode);
+		spin_unlock(&vfspnode_lock);
+	}
+}
+
+/*
+ * must be called holding the vfspnode_lock
+ */
+static inline void
+put_pnode_locked(struct vfspnode *pnode)
+{
+	if (!pnode)
+		return;
+	if (atomic_dec_and_test(&pnode->pnode_count)) {
+		__put_pnode(pnode);
+	}
+}
+
+void __init pnode_init(unsigned long );
+struct vfspnode * pnode_alloc(void);
+void pnode_add_slave_mnt(struct vfspnode *, struct vfsmount *);
+void pnode_add_member_mnt(struct vfspnode *, struct vfsmount *);
+void pnode_del_slave_mnt(struct vfsmount *);
+void pnode_del_member_mnt(struct vfsmount *);
+void pnode_disassociate_mnt(struct vfsmount *);
+void pnode_add_slave_pnode(struct vfspnode *, struct vfspnode *);
+struct vfsmount * pnode_make_mounted(struct vfspnode *, struct vfsmount *,
+		struct dentry *);
+void pnode_member_to_slave(struct vfsmount *);
+int pnode_merge_pnode(struct vfspnode *, struct vfspnode *);
+struct vfsmount * pnode_make_mounted(struct vfspnode *, struct vfsmount *,
+		struct dentry *);
+int  pnode_make_unmounted(struct vfspnode *);
+int pnode_prepare_mount(struct vfspnode *, struct vfspnode *, struct dentry *,
+		struct vfsmount *, struct vfsmount *);
+int pnode_commit_mount(struct vfspnode *, int);
+int pnode_abort_mount(struct vfspnode *, struct vfsmount *);
+#endif /* _LINUX_PNODE_H */
Index: 2.6.12.work2/include/linux/mount.h
===================================================================
--- 2.6.12.work2.orig/include/linux/mount.h
+++ 2.6.12.work2/include/linux/mount.h
@@ -16,9 +16,21 @@
 #include <linux/spinlock.h>
 #include <asm/atomic.h>
 
-#define MNT_NOSUID	1
-#define MNT_NODEV	2
-#define MNT_NOEXEC	4
+#define MNT_NOSUID	0x01
+#define MNT_NODEV	0x02
+#define MNT_NOEXEC	0x04
+#define MNT_PRIVATE	0x10  /* if the vfsmount is private, by default it is private*/
+#define MNT_SLAVE	0x20  /* if the vfsmount is a slave mount of its pnode */
+#define MNT_SHARED	0x40  /* if the vfsmount is a slave mount of its pnode */
+#define MNT_PNODE_MASK	0xf0  /* propogation flag mask */
+
+#define IS_MNT_SHARED(mnt) (mnt->mnt_flags & MNT_SHARED)
+#define IS_MNT_SLAVE(mnt) (mnt->mnt_flags & MNT_SLAVE)
+#define IS_MNT_PRIVATE(mnt) (mnt->mnt_flags & MNT_PRIVATE)
+
+#define CLEAR_MNT_SHARED(mnt) (mnt->mnt_flags &= ~(MNT_PNODE_MASK & MNT_SHARED))
+#define CLEAR_MNT_PRIVATE(mnt) (mnt->mnt_flags &= ~(MNT_PNODE_MASK & MNT_PRIVATE))
+#define CLEAR_MNT_SLAVE(mnt) (mnt->mnt_flags &= ~(MNT_PNODE_MASK & MNT_SLAVE))
 
 struct vfsmount
 {
@@ -29,6 +41,10 @@ struct vfsmount
 	struct super_block *mnt_sb;	/* pointer to superblock */
 	struct list_head mnt_mounts;	/* list of children, anchored here */
 	struct list_head mnt_child;	/* and going through their mnt_child */
+	struct list_head mnt_pnode_mntlist;/* and going through their
+					   pnode's vfsmount */
+	struct vfspnode *mnt_pnode;	/* and going through their
+					   pnode's vfsmount */
 	atomic_t mnt_count;
 	int mnt_flags;
 	int mnt_expiry_mark;		/* true if marked for expiry */
@@ -38,6 +54,28 @@ struct vfsmount
 	struct namespace *mnt_namespace; /* containing namespace */
 };
 
+static inline void set_mnt_shared(struct vfsmount *mnt)
+{
+	mnt->mnt_flags |= MNT_PNODE_MASK & MNT_SHARED;
+	CLEAR_MNT_PRIVATE(mnt);
+	CLEAR_MNT_SLAVE(mnt);
+}
+
+static inline void set_mnt_private(struct vfsmount *mnt)
+{
+	mnt->mnt_flags |= MNT_PNODE_MASK & MNT_PRIVATE;
+	CLEAR_MNT_SLAVE(mnt);
+	CLEAR_MNT_SHARED(mnt);
+	mnt->mnt_pnode = NULL;
+}
+
+static inline void set_mnt_slave(struct vfsmount *mnt)
+{
+	mnt->mnt_flags |= MNT_PNODE_MASK & MNT_SLAVE;
+	CLEAR_MNT_PRIVATE(mnt);
+	CLEAR_MNT_SHARED(mnt);
+}
+
 static inline struct vfsmount *mntget(struct vfsmount *mnt)
 {
 	if (mnt)
Index: 2.6.12.work2/fs/Makefile
===================================================================
--- 2.6.12.work2.orig/fs/Makefile
+++ 2.6.12.work2/fs/Makefile
@@ -8,7 +8,7 @@
 obj-y :=	open.o read_write.o file_table.o buffer.o  bio.o super.o \
 		block_dev.o char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \
 		ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \
-		attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \
+		attr.o bad_inode.o file.o filesystems.o namespace.o pnode.o aio.o \
 		seq_file.o xattr.o libfs.o fs-writeback.o mpage.o direct-io.o \
 
 obj-$(CONFIG_EPOLL)		+= eventpoll.o

^ permalink raw reply	[flat|nested] 37+ messages in thread

* (unknown)
  2005-07-25 22:44 (unknown) Ram Pai
  2005-07-25 22:44 ` (unknown) Ram Pai
@ 2005-07-25 22:44 ` Ram Pai
  2005-07-25 22:44 ` (unknown) Ram Pai
                   ` (7 subsequent siblings)
  9 siblings, 0 replies; 37+ messages in thread
From: Ram Pai @ 2005-07-25 22:44 UTC (permalink / raw)
  To: akpm, Al Viro; +Cc: Avantika Mathur, Mike Waychison

, miklos@szeredi.hu, Janak Desai <janak@us.ibm.com>, linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org
Subject: [PATCH 2/7] shared subtree
Content-Type: text/x-patch; name=unclone.patch
Content-Disposition: inline; filename=unclone.patch

 Adds the ability to unclone a vfs tree. A uncloned vfs tree will not be
 clonnable, and hence cannot be bind/rbind to any other mountpoint.

 RP

Signed by Ram Pai (linuxram@us.ibm.com)

 fs/namespace.c        |   15 ++++++++++++++-
 include/linux/fs.h    |    1 +
 include/linux/mount.h |   15 +++++++++++++++
 3 files changed, 30 insertions(+), 1 deletion(-)

Index: 2.6.12.work2/fs/namespace.c
===================================================================
--- 2.6.12.work2.orig/fs/namespace.c
+++ 2.6.12.work2/fs/namespace.c
@@ -673,6 +673,14 @@ static int do_make_private(struct vfsmou
 	return 0;
 }
 
+static int do_make_unclone(struct vfsmount *mnt)
+{
+	if(mnt->mnt_pnode)
+		pnode_disassociate_mnt(mnt);
+	set_mnt_unclone(mnt);
+	return 0;
+}
+
 /*
  * recursively change the type of the mountpoint.
  */
@@ -682,6 +690,7 @@ static int do_change_type(struct nameida
 	int err=0;
 
 	if (!(flag & MS_SHARED) && !(flag & MS_PRIVATE)
+			&& !(flag & MS_UNCLONE)
 			&& !(flag & MS_SLAVE))
 		return -EINVAL;
 
@@ -700,6 +709,9 @@ static int do_change_type(struct nameida
 		case MS_PRIVATE:
 			err = do_make_private(m);
 			break;
+		case MS_UNCLONE:
+			err = do_make_unclone(m);
+			break;
 		}
 	}
 	spin_unlock(&vfsmount_lock);
@@ -1140,7 +1152,8 @@ long do_mount(char * dev_name, char * di
 				    data_page);
 	else if (flags & MS_BIND)
 		retval = do_loopback(&nd, dev_name, flags & MS_REC);
-	else if (flags & MS_SHARED || flags & MS_PRIVATE || flags & MS_SLAVE)
+	else if (flags & MS_SHARED || flags & MS_UNCLONE ||
+			flags & MS_PRIVATE || flags & MS_SLAVE)
 		retval = do_change_type(&nd, flags);
 	else if (flags & MS_MOVE)
 		retval = do_move_mount(&nd, dev_name);
Index: 2.6.12.work2/include/linux/fs.h
===================================================================
--- 2.6.12.work2.orig/include/linux/fs.h
+++ 2.6.12.work2/include/linux/fs.h
@@ -102,6 +102,7 @@ extern int dir_notify_enable;
 #define MS_MOVE		8192
 #define MS_REC		16384
 #define MS_VERBOSE	32768
+#define MS_UNCLONE	(1<<17) /* recursively change to unclonnable */
 #define MS_PRIVATE	(1<<18) /* recursively change to private */
 #define MS_SLAVE	(1<<19) /* recursively change to slave */
 #define MS_SHARED	(1<<20) /* recursively change to shared */
Index: 2.6.12.work2/include/linux/mount.h
===================================================================
--- 2.6.12.work2.orig/include/linux/mount.h
+++ 2.6.12.work2/include/linux/mount.h
@@ -22,15 +22,18 @@
 #define MNT_PRIVATE	0x10  /* if the vfsmount is private, by default it is private*/
 #define MNT_SLAVE	0x20  /* if the vfsmount is a slave mount of its pnode */
 #define MNT_SHARED	0x40  /* if the vfsmount is a slave mount of its pnode */
+#define MNT_UNCLONE	0x80  /* if the vfsmount is unclonable */
 #define MNT_PNODE_MASK	0xf0  /* propogation flag mask */
 
 #define IS_MNT_SHARED(mnt) (mnt->mnt_flags & MNT_SHARED)
 #define IS_MNT_SLAVE(mnt) (mnt->mnt_flags & MNT_SLAVE)
 #define IS_MNT_PRIVATE(mnt) (mnt->mnt_flags & MNT_PRIVATE)
+#define IS_MNT_UNCLONE(mnt) (mnt->mnt_flags & MNT_UNCLONE)
 
 #define CLEAR_MNT_SHARED(mnt) (mnt->mnt_flags &= ~(MNT_PNODE_MASK & MNT_SHARED))
 #define CLEAR_MNT_PRIVATE(mnt) (mnt->mnt_flags &= ~(MNT_PNODE_MASK & MNT_PRIVATE))
 #define CLEAR_MNT_SLAVE(mnt) (mnt->mnt_flags &= ~(MNT_PNODE_MASK & MNT_SLAVE))
+#define CLEAR_MNT_UNCLONE(mnt) (mnt->mnt_flags &= ~(MNT_PNODE_MASK & MNT_UNCLONE))
 
 struct vfsmount
 {
@@ -59,6 +62,7 @@ static inline void set_mnt_shared(struct
 	mnt->mnt_flags |= MNT_PNODE_MASK & MNT_SHARED;
 	CLEAR_MNT_PRIVATE(mnt);
 	CLEAR_MNT_SLAVE(mnt);
+	CLEAR_MNT_UNCLONE(mnt);
 }
 
 static inline void set_mnt_private(struct vfsmount *mnt)
@@ -66,6 +70,16 @@ static inline void set_mnt_private(struc
 	mnt->mnt_flags |= MNT_PNODE_MASK & MNT_PRIVATE;
 	CLEAR_MNT_SLAVE(mnt);
 	CLEAR_MNT_SHARED(mnt);
+	CLEAR_MNT_UNCLONE(mnt);
+	mnt->mnt_pnode = NULL;
+}
+
+static inline void set_mnt_unclone(struct vfsmount *mnt)
+{
+	mnt->mnt_flags |= MNT_PNODE_MASK & MNT_UNCLONE;
+	CLEAR_MNT_SLAVE(mnt);
+	CLEAR_MNT_SHARED(mnt);
+	CLEAR_MNT_PRIVATE(mnt);
 	mnt->mnt_pnode = NULL;
 }
 
@@ -74,6 +88,7 @@ static inline void set_mnt_slave(struct 
 	mnt->mnt_flags |= MNT_PNODE_MASK & MNT_SLAVE;
 	CLEAR_MNT_PRIVATE(mnt);
 	CLEAR_MNT_SHARED(mnt);
+	CLEAR_MNT_UNCLONE(mnt);
 }
 
 static inline struct vfsmount *mntget(struct vfsmount *mnt)

^ permalink raw reply	[flat|nested] 37+ messages in thread

* (unknown)
  2005-07-25 22:44 (unknown) Ram Pai
  2005-07-25 22:44 ` (unknown) Ram Pai
  2005-07-25 22:44 ` (unknown) Ram Pai
@ 2005-07-25 22:44 ` Ram Pai
  2005-07-25 22:44 ` (unknown) Ram Pai
                   ` (6 subsequent siblings)
  9 siblings, 0 replies; 37+ messages in thread
From: Ram Pai @ 2005-07-25 22:44 UTC (permalink / raw)
  To: akpm, Al Viro; +Cc: Avantika Mathur, Mike Waychison

, miklos@szeredi.hu, Janak Desai <janak@us.ibm.com>, linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org
Subject: [PATCH 3/7] shared subtree
Content-Type: text/x-patch; name=rbind.patch
Content-Disposition: inline; filename=rbind.patch

Adds the ability to bind/rbind a shared/private/slave subtree and set up
propogation wherever needed.

RP

Signed by Ram Pai (linuxram@us.ibm.com)

 fs/namespace.c            |  660 ++++++++++++++++++++++++++++++++++++++++------
 fs/pnode.c                |  235 ++++++++++++++++
 include/linux/dcache.h    |    2 
 include/linux/fs.h        |    5 
 include/linux/namespace.h |    1 
 5 files changed, 826 insertions(+), 77 deletions(-)

Index: 2.6.12.work2/fs/namespace.c
===================================================================
--- 2.6.12.work2.orig/fs/namespace.c
+++ 2.6.12.work2/fs/namespace.c
@@ -42,7 +42,8 @@ static inline int sysfs_init(void)
 
 static struct list_head *mount_hashtable;
 static int hash_mask, hash_bits;
-static kmem_cache_t *mnt_cache; 
+static kmem_cache_t *mnt_cache;
+static struct rw_semaphore namespace_sem;
 
 static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
 {
@@ -54,7 +55,7 @@ static inline unsigned long hash(struct 
 
 struct vfsmount *alloc_vfsmnt(const char *name)
 {
-	struct vfsmount *mnt = kmem_cache_alloc(mnt_cache, GFP_KERNEL); 
+	struct vfsmount *mnt = kmem_cache_alloc(mnt_cache, GFP_KERNEL);
 	if (mnt) {
 		memset(mnt, 0, sizeof(struct vfsmount));
 		atomic_set(&mnt->mnt_count,1);
@@ -86,7 +87,8 @@ void free_vfsmnt(struct vfsmount *mnt)
  * Now, lookup_mnt increments the ref count before returning
  * the vfsmount struct.
  */
-struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
+struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
+		struct dentry *root)
 {
 	struct list_head * head = mount_hashtable + hash(mnt, dentry);
 	struct list_head * tmp = head;
@@ -99,7 +101,8 @@ struct vfsmount *lookup_mnt(struct vfsmo
 		if (tmp == head)
 			break;
 		p = list_entry(tmp, struct vfsmount, mnt_hash);
-		if (p->mnt_parent == mnt && p->mnt_mountpoint == dentry) {
+		if (p->mnt_parent == mnt && p->mnt_mountpoint == dentry &&
+				(root == NULL || p->mnt_root == root)) {
 			found = mntget(p);
 			break;
 		}
@@ -108,6 +111,37 @@ struct vfsmount *lookup_mnt(struct vfsmo
 	return found;
 }
 
+struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
+{
+	return __lookup_mnt(mnt, dentry, NULL);
+}
+
+static struct vfsmount *
+clone_mnt(struct vfsmount *old, struct dentry *root)
+{
+	struct super_block *sb = old->mnt_sb;
+	struct vfsmount *mnt = alloc_vfsmnt(old->mnt_devname);
+
+	if (mnt) {
+		mnt->mnt_flags = old->mnt_flags;
+		atomic_inc(&sb->s_active);
+		mnt->mnt_sb = sb;
+		mnt->mnt_root = dget(root);
+		mnt->mnt_mountpoint = mnt->mnt_root;
+		mnt->mnt_parent = mnt;
+		mnt->mnt_namespace = old->mnt_namespace;
+		mnt->mnt_pnode = get_pnode(old->mnt_pnode);
+
+		/* stick the duplicate mount on the same expiry list
+		 * as the original if that was on one */
+		spin_lock(&vfsmount_lock);
+		if (!list_empty(&old->mnt_fslink))
+			list_add(&mnt->mnt_fslink, &old->mnt_fslink);
+		spin_unlock(&vfsmount_lock);
+	}
+	return mnt;
+}
+
 static inline int check_mnt(struct vfsmount *mnt)
 {
 	return mnt->mnt_namespace == current->namespace;
@@ -128,11 +162,71 @@ static void attach_mnt(struct vfsmount *
 {
 	mnt->mnt_parent = mntget(nd->mnt);
 	mnt->mnt_mountpoint = dget(nd->dentry);
-	list_add(&mnt->mnt_hash, mount_hashtable+hash(nd->mnt, nd->dentry));
+	mnt->mnt_namespace = nd->mnt->mnt_namespace;
+	list_add_tail(&mnt->mnt_hash,
+			mount_hashtable+hash(nd->mnt, nd->dentry));
 	list_add_tail(&mnt->mnt_child, &nd->mnt->mnt_mounts);
 	nd->dentry->d_mounted++;
 }
 
+static void attach_prepare_mnt(struct vfsmount *mnt, struct nameidata *nd)
+{
+	mnt->mnt_parent = mntget(nd->mnt);
+	mnt->mnt_mountpoint = dget(nd->dentry);
+	nd->dentry->d_mounted++;
+}
+
+
+void do_attach_commit_mnt(struct vfsmount *mnt)
+{
+	struct vfsmount *parent = mnt->mnt_parent;
+	BUG_ON(parent==mnt);
+	if(list_empty(&mnt->mnt_hash))
+		list_add_tail(&mnt->mnt_hash,
+			mount_hashtable+hash(parent, mnt->mnt_mountpoint));
+	if(list_empty(&mnt->mnt_child))
+		list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
+	mnt->mnt_namespace = parent->mnt_namespace;
+	list_add_tail(&mnt->mnt_list, &mnt->mnt_namespace->list);
+}
+
+struct vfsmount *do_attach_prepare_mnt(struct vfsmount *mnt,
+		struct dentry *dentry,
+		struct vfsmount *template_mnt,
+		int clone_flag)
+{
+	struct vfsmount *child_mnt;
+	struct nameidata nd;
+
+	if (clone_flag) {
+		if(!(child_mnt = clone_mnt(template_mnt,
+				template_mnt->mnt_root)))
+			return NULL;
+	} else
+		child_mnt = template_mnt;
+
+	nd.mnt = mnt;
+	nd.dentry = dentry;
+
+	attach_prepare_mnt(child_mnt, &nd);
+
+	return child_mnt;
+}
+
+void do_detach_prepare_mnt(struct vfsmount *mnt, int free_flag)
+{
+	mnt->mnt_mountpoint->d_mounted--;
+	mntput(mnt->mnt_parent);
+	dput(mnt->mnt_mountpoint);
+	if (free_flag) {
+		BUG_ON(atomic_read(&mnt->mnt_count) != 1);
+		spin_lock(&vfsmount_lock);
+		list_del_init(&mnt->mnt_fslink);
+		spin_unlock(&vfsmount_lock);
+		mntput(mnt);
+	}
+}
+
 static struct vfsmount *next_mnt(struct vfsmount *p, struct vfsmount *root)
 {
 	struct list_head *next = p->mnt_mounts.next;
@@ -149,29 +243,14 @@ static struct vfsmount *next_mnt(struct 
 	return list_entry(next, struct vfsmount, mnt_child);
 }
 
-static struct vfsmount *
-clone_mnt(struct vfsmount *old, struct dentry *root)
+static struct vfsmount *skip_mnt_tree(struct vfsmount *p)
 {
-	struct super_block *sb = old->mnt_sb;
-	struct vfsmount *mnt = alloc_vfsmnt(old->mnt_devname);
-
-	if (mnt) {
-		mnt->mnt_flags = old->mnt_flags;
-		atomic_inc(&sb->s_active);
-		mnt->mnt_sb = sb;
-		mnt->mnt_root = dget(root);
-		mnt->mnt_mountpoint = mnt->mnt_root;
-		mnt->mnt_parent = mnt;
-		mnt->mnt_namespace = old->mnt_namespace;
-
-		/* stick the duplicate mount on the same expiry list
-		 * as the original if that was on one */
-		spin_lock(&vfsmount_lock);
-		if (!list_empty(&old->mnt_fslink))
-			list_add(&mnt->mnt_fslink, &old->mnt_fslink);
-		spin_unlock(&vfsmount_lock);
+	struct list_head *prev = p->mnt_mounts.prev;
+	while (prev != &p->mnt_mounts) {
+		p = list_entry(prev, struct vfsmount, mnt_child);
+		prev = p->mnt_mounts.prev;
 	}
-	return mnt;
+	return p;
 }
 
 void __mntput(struct vfsmount *mnt)
@@ -191,7 +270,7 @@ static void *m_start(struct seq_file *m,
 	struct list_head *p;
 	loff_t l = *pos;
 
-	down_read(&n->sem);
+	down_read(&namespace_sem);
 	list_for_each(p, &n->list)
 		if (!l--)
 			return list_entry(p, struct vfsmount, mnt_list);
@@ -208,8 +287,7 @@ static void *m_next(struct seq_file *m, 
 
 static void m_stop(struct seq_file *m, void *v)
 {
-	struct namespace *n = m->private;
-	up_read(&n->sem);
+	up_read(&namespace_sem);
 }
 
 static inline void mangle(struct seq_file *m, const char *s)
@@ -433,7 +511,7 @@ static int do_umount(struct vfsmount *mn
 		return retval;
 	}
 
-	down_write(&current->namespace->sem);
+	down_write(&namespace_sem);
 	spin_lock(&vfsmount_lock);
 
 	if (atomic_read(&sb->s_active) == 1) {
@@ -455,7 +533,7 @@ static int do_umount(struct vfsmount *mn
 	spin_unlock(&vfsmount_lock);
 	if (retval)
 		security_sb_umount_busy(mnt);
-	up_write(&current->namespace->sem);
+	up_write(&namespace_sem);
 	return retval;
 }
 
@@ -495,9 +573,9 @@ out:
 #ifdef __ARCH_WANT_SYS_OLDUMOUNT
 
 /*
- *	The 2.0 compatible umount. No flags. 
+ *	The 2.0 compatible umount. No flags.
  */
- 
+
 asmlinkage long sys_oldumount(char __user * name)
 {
 	return sys_umount(name,0);
@@ -541,6 +619,9 @@ static struct vfsmount *copy_tree(struct
 	struct list_head *h;
 	struct nameidata nd;
 
+	if (IS_MNT_UNCLONE(mnt))
+		return NULL;
+
 	res = q = clone_mnt(mnt, dentry);
 	if (!q)
 		goto Enomem;
@@ -549,10 +630,15 @@ static struct vfsmount *copy_tree(struct
 	p = mnt;
 	for (h = mnt->mnt_mounts.next; h != &mnt->mnt_mounts; h = h->next) {
 		r = list_entry(h, struct vfsmount, mnt_child);
+
 		if (!lives_below_in_same_fs(r->mnt_mountpoint, dentry))
 			continue;
 
 		for (s = r; s; s = next_mnt(s, r)) {
+			if (IS_MNT_UNCLONE(s)) {
+				s = skip_mnt_tree(s);
+				continue;
+			}
 			while (p != s->mnt_parent) {
 				p = p->mnt_parent;
 				q = q->mnt_parent;
@@ -579,9 +665,276 @@ static struct vfsmount *copy_tree(struct
 	return NULL;
 }
 
+/*
+ * return 1 if the mount tree contains a shared or slave mount
+ */
+static inline int tree_contains_sharedorslave(struct vfsmount *mnt)
+{
+	struct vfsmount *p;
+	for (p = mnt; p; p = next_mnt(p, mnt)) {
+		if (IS_MNT_SHARED(p) || IS_MNT_SLAVE(p))
+			return 1;
+	}
+	return 0;
+}
+
+/*
+ * commit the operations done in attach_recursive_mnt(). run through pnode list
+ * headed at 'pnodehead', and commit the operation done in
+ * attach_recursive_mnt();
+ */
+
+static void commit_attach_recursive_mnt(struct list_head *pnodehead)
+{
+	struct vfspnode *t_p, *tmp_pnode;
+
+	/*
+	 * Merge or delete or slave each of the temporary pnode
+	 */
+	spin_lock(&vfsmount_lock);
+	list_for_each_entry_safe(tmp_pnode, t_p, pnodehead,
+			pnode_peer_slave) {
+
+		int del_flag = IS_PNODE_DELETE(tmp_pnode);
+		int slave_flag = IS_PNODE_SLAVE(tmp_pnode);
+		struct vfspnode *master_pnode = tmp_pnode->pnode_master;
+
+		list_del_init(&tmp_pnode->pnode_peer_slave);
+		pnode_commit_mount(tmp_pnode, del_flag);
+
+		if (!del_flag && master_pnode) {
+			tmp_pnode->pnode_master = NULL;
+
+			if (slave_flag)
+				pnode_add_slave_pnode(master_pnode, tmp_pnode);
+			else
+				pnode_merge_pnode(tmp_pnode, master_pnode);
+
+			/*
+			 * we don't need the extra reference to
+			 * the master_pnode, which was created either
+			 * (a) pnode_add_slave_pnode: when the mnt
+			 * 	was made as a slave mnt.
+			 * (b) pnode_merge_pnode: during clone_mnt().
+			 */
+			put_pnode(master_pnode);
+		}
+	}
+	spin_unlock(&vfsmount_lock);
+}
+
+/*
+ * abort the operations done in attach_recursive_mnt(). run through the mount
+ * tree, till vfsmount 'last' and undo the changes.  Ensure that all the mounts
+ * in the tree are all back in the mnt_list headed at 'source_mnt'.
+ * NOTE: This function is closely tied to the logic in
+ * 'attach_recursive_mnt()'
+ */
+static void abort_attach_recursive_mnt(struct vfsmount *source_mnt, struct
+		vfsmount *last, struct list_head *head) { struct vfsmount *p =
+	source_mnt, *m; struct vfspnode *src_pnode;
+
+	if (!last)
+		return;
+
+	do {
+		int is_unclone, is_pnode_slave;
+
+		m = p;
+		is_unclone = IS_MNT_UNCLONE(m);
+
+		BUG_ON(!m->mnt_pnode);
+
+		is_pnode_slave = IS_PNODE_SLAVE(m->mnt_pnode);
+		src_pnode = m->mnt_pnode->pnode_master;
+		m->mnt_pnode->pnode_master = NULL;
+		pnode_abort_mount(m->mnt_pnode, m);
+
+		m->mnt_pnode = src_pnode;
+		if (src_pnode) {
+			if(is_pnode_slave)
+				set_mnt_slave(m);
+			else
+				set_mnt_shared(m);
+		} else {
+			if (is_unclone)
+				set_mnt_unclone(m);
+			else
+				set_mnt_private(m);
+		}
+
+
+		list_add_tail(&m->mnt_list, head);
+		p = next_mnt(m, source_mnt);
+
+	} while ( p && m != last );
+	source_mnt->mnt_parent = source_mnt;
+	list_del_init(head);
+}
+
+ /*
+ *  @source_mnt : mount tree to be attached
+ *  @nd		: place the mount tree @source_mnt is attached
+ *
+ *  NOTE: in the table below explains the semantics when a source vfsmount
+ *  of a given type is attached to a destination vfsmount of a give type.
+ *  ---------------------------------------------------------------------
+ *  |				BIND MOUNT OPERATION			|
+ *  |*******************************************************************|
+ *  |  dest --> | shared	|	private	 |  slave   |unclonable	|
+ *  | source	|		|       	 |   	    |    	|
+ *  |   |   	|		|       	 |   	    |    	|
+ *  |   v 	|		|       	 |   	    |    	|
+ *  |*******************************************************************|
+ *  |	     	|		|       	 |   	    |    	|
+ *  |  shared	| shared (++) 	|      shared (+)|shared (+)| shared (+)|
+ *  |		|		|       	 |   	    |    	|
+ *  |		|		|       	 |   	    |    	|
+ *  | private	| shared (+)	|      private	 | private  | private  	|
+ *  |		|		|       	 |   	    |    	|
+ *  |		|		|       	 |   	    |    	|
+ *  | slave	| shared (+)	|      private   | private  | private  	|
+ *  |		|		|       	 |   	    |    	|
+ *  |		|		|       	 |   	    |    	|
+ *  | unclonable|    nomount	|       nomount	 |  nomount | nomount 	|
+ *  |		|		|       	 |   	    |    	|
+ *  |		|		|       	 |   	    |    	|
+ *   ********************************************************************
+ *
+ * (++)  the mount will be propogated to all the vfsmounts in the pnode tree
+ *    	  of the destination vfsmount, and all the non-slave new mounts in
+ *    	  destination vfsmount will be added the source vfsmount's pnode.
+ * (+)  the mount will be propogated to the destination vfsmount
+ *    	  and the new mount will be added to the source vfsmount's pnode.
+ *
+ * if the source mount is a tree, the operations explained above is
+ * applied to each vfsmount in the tree.
+ *
+ * Should be called without spinlocks held, because this function can sleep
+ * in allocations.
+ *
+  */
+static int attach_recursive_mnt(struct vfsmount *source_mnt,
+		struct nameidata *nd)
+{
+	struct vfsmount *mntpt_mnt, *last, *m, *p;
+	struct vfspnode *src_pnode, *dest_pnode, *tmp_pnode;
+	struct dentry *mntpt_dentry;
+	int ret;
+	LIST_HEAD(pnodehead);
+	LIST_HEAD(mnt_list_head);
+
+	/*
+	 * if the source tree has no shared or slave mounts and
+	 * the destination mount is not shared, fastpath.
+	 */
+	mntpt_mnt = nd->mnt;
+	dest_pnode = IS_MNT_SHARED(mntpt_mnt) ? mntpt_mnt->mnt_pnode : NULL;
+	if (!dest_pnode && !tree_contains_sharedorslave(source_mnt)) {
+		spin_lock(&vfsmount_lock);
+		attach_mnt(source_mnt, nd);
+		list_add_tail(&mnt_list_head, &source_mnt->mnt_list);
+		list_splice(&mnt_list_head, mntpt_mnt->mnt_namespace->list.prev);
+		spin_unlock(&vfsmount_lock);
+		goto out;
+	}
+
+	/*
+	 * Create temporary pnodes which shall hold all the new
+	 * mounts. Merge or delete or slave that pnode later in a separate
+	 * operation, depending on the type of source and destination mounts.
+	 */
+	p = NULL;
+	last = NULL;
+	list_add_tail(&mnt_list_head, &source_mnt->mnt_list);
+
+	for (m = source_mnt; m; m = next_mnt(m, source_mnt)) {
+
+		BUG_ON(IS_MNT_UNCLONE(m));
+
+		while (p && p != m->mnt_parent)
+			p = p->mnt_parent;
+
+		if (!p) {
+			mntpt_dentry = nd->dentry;
+			mntpt_mnt = nd->mnt;
+		} else {
+			mntpt_dentry = m->mnt_mountpoint;
+			mntpt_mnt    = p;
+		}
+		p=m;
+
+		dest_pnode = IS_MNT_SHARED(mntpt_mnt) ?
+			mntpt_mnt->mnt_pnode : NULL;
+		src_pnode = (IS_MNT_SHARED(m))?
+				m->mnt_pnode : NULL;
+
+		/*
+		 * get a temporary pnode into which add the new vfs, and keep
+		 * track of these pnodes and their real pnode.
+		 */
+		if (!(tmp_pnode = pnode_alloc())) {
+			ret =  -ENOMEM;
+			goto error;
+		}
+
+		m->mnt_pnode = NULL;
+		list_del_init(&m->mnt_list);
+		list_add_tail(&tmp_pnode->pnode_peer_slave, &pnodehead);
+
+		if (dest_pnode) {
+			if ((ret = pnode_prepare_mount(dest_pnode, tmp_pnode,
+					mntpt_dentry, m, mntpt_mnt))) {
+				tmp_pnode->pnode_master = src_pnode;
+				m->mnt_pnode = tmp_pnode;
+				last = m;
+				goto error;
+			}
+		} else {
+			if (m == m->mnt_parent)
+				do_attach_prepare_mnt(mntpt_mnt,
+					mntpt_dentry, m, 0);
+			pnode_add_member_mnt(tmp_pnode, m);
+			if (!src_pnode) {
+				set_mnt_private(m);
+				/*
+				 * NOTE: set_mnt_private()
+				 * resets m->mnt_pnode.
+				 * Reinitialize it. This is needed to
+				 * decrement the refcount on the
+				 * pnode when the mount 'm' is
+				 * unlinked in pnode_commit_mount().
+				 */
+				m->mnt_pnode = tmp_pnode;
+				SET_PNODE_DELETE(tmp_pnode);
+			}
+		}
+
+		/*
+		 * temporarily track the pnode with which the tmp_pnode
+		 * has to merge with; in the pnode_master field.
+		 */
+		tmp_pnode->pnode_master = src_pnode;
+		last = m;
+	}
+	commit_attach_recursive_mnt(&pnodehead);
+out:
+	mntget(source_mnt);
+	return 0;
+error:
+	/*
+	 * ok we have errored out either because of memory exhaustion
+	 * or something else not in our control. Gracefully return
+	 * leaving no mess behind. Else it will haunt you. :(
+	 */
+	abort_attach_recursive_mnt(source_mnt, last, &mnt_list_head);
+	return 1;
+}
+
 static int graft_tree(struct vfsmount *mnt, struct nameidata *nd)
 {
-	int err;
+	int err, ret;
+
 	if (mnt->mnt_sb->s_flags & MS_NOUSER)
 		return -EINVAL;
 
@@ -599,17 +952,12 @@ static int graft_tree(struct vfsmount *m
 		goto out_unlock;
 
 	err = -ENOENT;
-	spin_lock(&vfsmount_lock);
-	if (IS_ROOT(nd->dentry) || !d_unhashed(nd->dentry)) {
-		struct list_head head;
 
-		attach_mnt(mnt, nd);
-		list_add_tail(&head, &mnt->mnt_list);
-		list_splice(&head, current->namespace->list.prev);
-		mntget(mnt);
-		err = 0;
-	}
+	spin_lock(&vfsmount_lock);
+	ret = (IS_ROOT(nd->dentry) || !d_unhashed(nd->dentry));
 	spin_unlock(&vfsmount_lock);
+	if (ret)
+		err = attach_recursive_mnt(mnt, nd);
 out_unlock:
 	up(&nd->dentry->d_inode->i_sem);
 	if (!err)
@@ -681,6 +1029,147 @@ static int do_make_unclone(struct vfsmou
 	return 0;
 }
 
+ /*
+ * This operation is equivalent of mount --bind dir dir
+ * create a new mount at the dentry, and unmount all child mounts
+ * mounted on top of dentries below 'dentry', and mount them
+ * under the new mount.
+  */
+struct vfsmount *do_make_mounted(struct vfsmount *mnt, struct dentry *dentry)
+{
+	struct vfsmount *child_mnt, *next;
+	struct nameidata nd;
+	struct vfsmount *newmnt = clone_mnt(mnt, dentry);
+	LIST_HEAD(head);
+
+	/*
+	 * note clone_mnt() gets a reference to the pnode.
+	 * we won't use that pnode anyway. So just let it
+	 * go
+	 */
+	put_pnode(newmnt->mnt_pnode);
+	newmnt->mnt_pnode = NULL;
+
+	if (newmnt) {
+		/*
+		 * walk through the mount list of mnt and move
+		 * them under the new mount
+		 */
+		spin_lock(&vfsmount_lock);
+		list_del_init(&newmnt->mnt_fslink);
+
+		list_for_each_entry_safe(child_mnt, next,
+				&mnt->mnt_mounts, mnt_child) {
+
+			if(child_mnt->mnt_mountpoint == dentry)
+				continue;
+
+			if(!is_subdir(child_mnt->mnt_mountpoint, dentry))
+				continue;
+
+			detach_mnt(child_mnt, &nd);
+			nd.mnt = newmnt;
+			attach_mnt(child_mnt, &nd);
+		}
+
+		nd.mnt = mnt;
+		nd.dentry = dentry;
+		attach_mnt(newmnt, &nd);
+		list_add_tail(&newmnt->mnt_list, &newmnt->mnt_namespace->list);
+ 		spin_unlock(&vfsmount_lock);
+ 	}
+	return newmnt;
+}
+
+ /*
+ * Inverse operation of do_make_mounted()
+  */
+int do_make_unmounted(struct vfsmount *mnt)
+{
+	struct vfsmount *parent_mnt, *child_mnt, *next;
+	struct nameidata nd;
+
+	/* validate if mount has a different parent */
+	parent_mnt = mnt->mnt_parent;
+	if (mnt == parent_mnt)
+		return 0;
+	/*
+	 * cannot unmount a mount that is not created
+	 * as a overlay mount.
+	 */
+	if (mnt->mnt_mountpoint != mnt->mnt_root)
+		return -EINVAL;
+
+	/* for each submounts in the parent, put the mounts back */
+	spin_lock(&vfsmount_lock);
+	list_for_each_entry_safe(child_mnt, next, &mnt->mnt_mounts, mnt_child) {
+		detach_mnt(child_mnt, &nd);
+		nd.mnt = parent_mnt;
+		attach_mnt(child_mnt, &nd);
+ 	}
+	detach_mnt(mnt, &nd);
+ 	spin_unlock(&vfsmount_lock);
+	return 0;
+}
+
+/*
+ * @nd: contains the vfsmount and the dentry where the new mount
+ * 	is the be created
+ * @mnt: returns the newly created mount.
+ * Create a new mount at the location specified by 'nd' and
+ * propogate the mount to all other mounts if the mountpoint
+ * is under a shared mount.
+ */
+int make_mounted(struct nameidata *nd, struct vfsmount **mnt)
+{
+	struct vfsmount *parent_mnt;
+	struct dentry *parent_dentry;
+	int err = mount_is_safe(nd);
+	if (err)
+		return err;
+	parent_dentry = nd->dentry;
+	parent_mnt = nd->mnt;
+ 	/*
+	 * check if dentry already has a vfsmount
+	 * if it does not, create and attach
+	 * a new vfsmount at that dentry.
+	 * Also propogate the mount if parent_mnt
+	 * is shared.
+ 	 */
+	if(parent_dentry != parent_mnt->mnt_root) {
+		*mnt = IS_MNT_SHARED(parent_mnt) ?
+			 pnode_make_mounted(parent_mnt->mnt_pnode,
+					 parent_mnt, parent_dentry) :
+			 do_make_mounted(parent_mnt, parent_dentry);
+		if (!*mnt)
+			err = -ENOMEM;
+ 	} else
+		*mnt = parent_mnt;
+	return err;
+}
+
+ /*
+ * Inverse operation of make_mounted()
+  */
+int make_unmounted(struct vfsmount *mnt)
+{
+	if (mnt == mnt->mnt_parent)
+		return 0;
+	/*
+	 * cannot unmount a mount that is not created
+	 * as a overlay mount.
+	 */
+	if (mnt->mnt_mountpoint != mnt->mnt_root)
+		return -EINVAL;
+
+	if (IS_MNT_SHARED(mnt))
+		pnode_make_unmounted(mnt->mnt_pnode);
+ 	else
+		do_make_unmounted(mnt);
+
+	return 0;
+}
+
 /*
  * recursively change the type of the mountpoint.
  */
@@ -724,7 +1213,7 @@ static int do_change_type(struct nameida
 static int do_loopback(struct nameidata *nd, char *old_name, int recurse)
 {
 	struct nameidata old_nd;
-	struct vfsmount *mnt = NULL;
+	struct vfsmount *mnt = NULL, *overlay_mnt=NULL;
 	int err = mount_is_safe(nd);
 	if (err)
 		return err;
@@ -734,14 +1223,31 @@ static int do_loopback(struct nameidata 
 	if (err)
 		return err;
 
-	down_write(&current->namespace->sem);
+	if (IS_MNT_UNCLONE(old_nd.mnt)) {
+		err = -EINVAL;
+		goto path_release;
+	}
+
+	down_write(&namespace_sem);
 	err = -EINVAL;
 	if (check_mnt(nd->mnt) && (!recurse || check_mnt(old_nd.mnt))) {
+
+		/*
+		 * If the dentry is not the root dentry, and if a bind
+		 * from a shared subtree is attempted, create a mount
+		 * at the dentry, and use the new mount as the starting
+		 * point for the bind/rbind operation.
+		 */
+		overlay_mnt = old_nd.mnt;
+		if(IS_MNT_SHARED(old_nd.mnt) &&
+			(err = make_mounted(&old_nd, &overlay_mnt)))
+			goto out;
+
 		err = -ENOMEM;
 		if (recurse)
-			mnt = copy_tree(old_nd.mnt, old_nd.dentry);
+			mnt = copy_tree(overlay_mnt, old_nd.dentry);
 		else
-			mnt = clone_mnt(old_nd.mnt, old_nd.dentry);
+			mnt = clone_mnt(overlay_mnt, old_nd.dentry);
 	}
 
 	if (mnt) {
@@ -752,15 +1258,25 @@ static int do_loopback(struct nameidata 
 
 		err = graft_tree(mnt, nd);
 		if (err) {
-			spin_lock(&vfsmount_lock);
-			umount_tree(mnt);
-			spin_unlock(&vfsmount_lock);
-		} else
-			mntput(mnt);
-	}
+ 			spin_lock(&vfsmount_lock);
+ 			umount_tree(mnt);
+ 			spin_unlock(&vfsmount_lock);
+			/*
+			 * ok we failed! so undo any overlay
+			 * mount that we did earlier.
+			 */
+			if (old_nd.mnt !=  overlay_mnt)
+				make_unmounted(overlay_mnt);
+ 		} else
+ 			mntput(mnt);
+ 	}
+
+ out:
+	up_write(&namespace_sem);
+
+ path_release:
+ 	path_release(&old_nd);
 
-	up_write(&current->namespace->sem);
-	path_release(&old_nd);
 	return err;
 }
 
@@ -808,7 +1324,7 @@ static int do_move_mount(struct nameidat
 	if (err)
 		return err;
 
-	down_write(&current->namespace->sem);
+	down_write(&namespace_sem);
 	while(d_mountpoint(nd->dentry) && follow_down(&nd->mnt, &nd->dentry))
 		;
 	err = -EINVAL;
@@ -852,7 +1368,7 @@ out2:
 out1:
 	up(&nd->dentry->d_inode->i_sem);
 out:
-	up_write(&current->namespace->sem);
+	up_write(&namespace_sem);
 	if (!err)
 		path_release(&parent_nd);
 	path_release(&old_nd);
@@ -891,7 +1407,7 @@ int do_add_mount(struct vfsmount *newmnt
 {
 	int err;
 
-	down_write(&current->namespace->sem);
+	down_write(&namespace_sem);
 	/* Something was mounted here while we slept */
 	while(d_mountpoint(nd->dentry) && follow_down(&nd->mnt, &nd->dentry))
 		;
@@ -920,7 +1436,7 @@ int do_add_mount(struct vfsmount *newmnt
 	}
 
 unlock:
-	up_write(&current->namespace->sem);
+	up_write(&namespace_sem);
 	mntput(newmnt);
 	return err;
 }
@@ -976,7 +1492,7 @@ void mark_mounts_for_expiry(struct list_
 		get_namespace(namespace);
 
 		spin_unlock(&vfsmount_lock);
-		down_write(&namespace->sem);
+		down_write(&namespace_sem);
 		spin_lock(&vfsmount_lock);
 
 		/* check that it is still dead: the count should now be 2 - as
@@ -1020,7 +1536,7 @@ void mark_mounts_for_expiry(struct list_
 			spin_unlock(&vfsmount_lock);
 		}
 
-		up_write(&namespace->sem);
+		up_write(&namespace_sem);
 
 		mntput(mnt);
 		put_namespace(namespace);
@@ -1066,7 +1582,7 @@ int copy_mount_options(const void __user
 	int i;
 	unsigned long page;
 	unsigned long size;
-	
+
 	*where = 0;
 	if (!data)
 		return 0;
@@ -1085,7 +1601,7 @@ int copy_mount_options(const void __user
 
 	i = size - exact_copy_from_user((void *)page, data, size);
 	if (!i) {
-		free_page(page); 
+		free_page(page);
 		return -EFAULT;
 	}
 	if (i != PAGE_SIZE)
@@ -1191,14 +1707,13 @@ int copy_namespace(int flags, struct tas
 		goto out;
 
 	atomic_set(&new_ns->count, 1);
-	init_rwsem(&new_ns->sem);
 	INIT_LIST_HEAD(&new_ns->list);
 
-	down_write(&tsk->namespace->sem);
+	down_write(&namespace_sem);
 	/* First pass: copy the tree topology */
 	new_ns->root = copy_tree(namespace->root, namespace->root->mnt_root);
 	if (!new_ns->root) {
-		up_write(&tsk->namespace->sem);
+		up_write(&namespace_sem);
 		kfree(new_ns);
 		goto out;
 	}
@@ -1232,7 +1747,7 @@ int copy_namespace(int flags, struct tas
 		p = next_mnt(p, namespace->root);
 		q = next_mnt(q, new_ns->root);
 	}
-	up_write(&tsk->namespace->sem);
+	up_write(&namespace_sem);
 
 	tsk->namespace = new_ns;
 
@@ -1414,7 +1929,7 @@ asmlinkage long sys_pivot_root(const cha
 	user_nd.mnt = mntget(current->fs->rootmnt);
 	user_nd.dentry = dget(current->fs->root);
 	read_unlock(&current->fs->lock);
-	down_write(&current->namespace->sem);
+	down_write(&namespace_sem);
 	down(&old_nd.dentry->d_inode->i_sem);
 	error = -EINVAL;
 	if (!check_mnt(user_nd.mnt))
@@ -1460,7 +1975,7 @@ asmlinkage long sys_pivot_root(const cha
 	path_release(&parent_nd);
 out2:
 	up(&old_nd.dentry->d_inode->i_sem);
-	up_write(&current->namespace->sem);
+	up_write(&namespace_sem);
 	path_release(&user_nd);
 	path_release(&old_nd);
 out1:
@@ -1487,7 +2002,6 @@ static void __init init_mount_tree(void)
 		panic("Can't allocate initial namespace");
 	atomic_set(&namespace->count, 1);
 	INIT_LIST_HEAD(&namespace->list);
-	init_rwsem(&namespace->sem);
 	list_add(&mnt->mnt_list, &namespace->list);
 	namespace->root = mnt;
 	mnt->mnt_namespace = namespace;
@@ -1510,6 +2024,8 @@ void __init mnt_init(unsigned long mempa
 	unsigned int nr_hash;
 	int i;
 
+	init_rwsem(&namespace_sem);
+
 	mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct vfsmount),
 			0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
 
@@ -1557,7 +2073,7 @@ void __put_namespace(struct namespace *n
 {
 	struct vfsmount *mnt;
 
-	down_write(&namespace->sem);
+	down_write(&namespace_sem);
 	spin_lock(&vfsmount_lock);
 
 	list_for_each_entry(mnt, &namespace->list, mnt_list) {
@@ -1566,6 +2082,6 @@ void __put_namespace(struct namespace *n
 
 	umount_tree(namespace->root);
 	spin_unlock(&vfsmount_lock);
-	up_write(&namespace->sem);
+	up_write(&namespace_sem);
 	kfree(namespace);
 }
Index: 2.6.12.work2/fs/pnode.c
===================================================================
--- 2.6.12.work2.orig/fs/pnode.c
+++ 2.6.12.work2/fs/pnode.c
@@ -26,7 +26,6 @@
 #include <asm/unistd.h>
 #include <stdarg.h>
 
-
 static kmem_cache_t * pnode_cachep;
 
 /* spinlock for pnode related operations */
@@ -90,10 +89,12 @@ static void inline pnode_add_mnt(struct 
 	mnt->mnt_pnode = pnode;
 	if (slave) {
 		set_mnt_slave(mnt);
-		list_add(&mnt->mnt_pnode_mntlist, &pnode->pnode_slavevfs);
+		list_add(&mnt->mnt_pnode_mntlist,
+				&pnode->pnode_slavevfs);
 	} else {
 		set_mnt_shared(mnt);
-		list_add(&mnt->mnt_pnode_mntlist, &pnode->pnode_vfs);
+		list_add(&mnt->mnt_pnode_mntlist,
+				&pnode->pnode_vfs);
 	}
 	get_pnode(pnode);
 	spin_unlock(&vfspnode_lock);
@@ -111,7 +112,6 @@ void pnode_add_slave_mnt(struct vfspnode
 	pnode_add_mnt(pnode, mnt, 1);
 }
 
-
 void pnode_add_slave_pnode(struct vfspnode *pnode,
 		struct vfspnode *slave_pnode)
 {
@@ -439,3 +439,230 @@ error:
 	pnode_end(&context);
 	goto out;
 }
+
+int pnode_mount_func(struct vfspnode *pnode, void *indata,
+		void **outdata, va_list args)
+{
+	struct vfspnode *pnode_slave, *pnode_master;
+	int ret=0;
+
+	pnode_master = indata;
+
+	if (*outdata)
+		pnode_slave = *outdata;
+	else if (!(pnode_slave = pnode_alloc()))
+		return -ENOMEM;
+
+	*outdata = pnode_slave;
+
+	if (pnode_slave && pnode_master)
+		pnode_add_slave_pnode(pnode_master, pnode_slave);
+	return ret;
+}
+
+int vfs_make_mounted_func(struct vfsmount *mnt, enum pnode_vfs_type flag,
+		void *indata, va_list args)
+{
+	struct dentry *target_dentry;
+	int ret=0;
+	struct vfsmount *child_mount;
+	struct vfspnode *pnode;
+
+	target_dentry = va_arg(args, struct dentry *);
+	if (!(child_mount = do_make_mounted(mnt, target_dentry))) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	pnode = (struct vfspnode *)indata;
+	switch (flag) {
+	case PNODE_SLAVE_VFS :
+		pnode_add_slave_mnt(pnode, child_mount);
+		break;
+	case PNODE_MEMBER_VFS :
+		pnode_add_member_mnt(pnode, child_mount);
+		break;
+	}
+
+out:
+	return ret;
+}
+
+/*
+ * @pnode: pnode that contains the vfsmounts, on which the
+ *  		new mount is created at dentry 'dentry'
+ * @dentry: the dentry on which the new mount is created
+ * @mnt:   return the mount created on this vfsmount
+ * walk through all the vfsmounts belonging to this pnode
+ * as well as its slave pnodes and for each vfsmount create
+ * a new vfsmount at 'dentry'.  Return the vfsmount created
+ * at 'dentry' of vfsmount 'mnt'.
+ */
+struct vfsmount *pnode_make_mounted(struct vfspnode *pnode,
+		struct vfsmount *mnt, struct dentry *dentry)
+{
+	struct vfsmount *child_mnt;
+	struct vfspnode *child_pnode;
+
+	if (!(child_pnode = pnode_alloc()))
+		return NULL;
+
+	if (pnode_traverse(pnode, child_pnode, (void *)NULL,
+			pnode_mount_func, NULL, vfs_make_mounted_func,
+			(void *)dentry))
+  		goto error;
+	child_mnt = __lookup_mnt(mnt, dentry, dentry);
+	mntput(child_mnt);
+	return child_mnt;
+
+error:
+	pnode_make_unmounted(child_pnode);
+	return NULL;
+}
+
+int vfs_make_unmounted_func(struct vfsmount *mnt, enum pnode_vfs_type flag,
+		void *indata, va_list args)
+{
+	struct vfspnode *pnode;
+	int ret=0;
+
+	if (do_make_unmounted(mnt)) {
+		ret = 1;
+		goto out;
+	}
+
+	pnode = mnt->mnt_pnode;
+	spin_lock(&vfspnode_lock);
+	list_del_init(&mnt->mnt_pnode_mntlist);
+	put_pnode_locked(pnode);
+	spin_unlock(&vfspnode_lock);
+out:
+	return ret;
+}
+
+int pnode_make_unmounted(struct vfspnode *pnode)
+{
+	return pnode_traverse(pnode, NULL, (void *)NULL,
+			NULL, NULL, vfs_make_unmounted_func);
+}
+
+int vfs_prepare_mount_func(struct vfsmount *mnt, enum pnode_vfs_type flag,
+		void *indata, va_list args)
+{
+	struct vfsmount *source_mnt, *child_mnt, *p_mnt;
+	struct dentry *mountpoint_dentry;
+	struct vfspnode *pnode = (struct vfspnode *)indata;
+
+	source_mnt = va_arg(args, struct vfsmount * );
+	mountpoint_dentry =  va_arg(args, struct dentry *);
+	p_mnt =  va_arg(args, struct vfsmount *);
+
+	if ((p_mnt != mnt) || (source_mnt == source_mnt->mnt_parent)) {
+		child_mnt = do_attach_prepare_mnt(mnt, mountpoint_dentry,
+				source_mnt, (p_mnt != mnt));
+		if (!child_mnt)
+			return -ENOMEM;
+
+		if (child_mnt != source_mnt)
+			put_pnode(source_mnt->mnt_pnode);
+	} else
+		child_mnt = source_mnt;
+
+	switch (flag) {
+	case PNODE_SLAVE_VFS :
+		pnode_add_slave_mnt(pnode, child_mnt);
+		break;
+	case PNODE_MEMBER_VFS :
+		pnode_add_member_mnt(pnode, child_mnt);
+		break;
+	}
+
+	return 0;
+}
+
+int pnode_prepare_mount(struct vfspnode *pnode,
+		struct vfspnode *master_child_pnode,
+		struct dentry *mountpoint_dentry,
+		struct vfsmount *source_mnt,
+		struct vfsmount *mnt)
+{
+	return  pnode_traverse(pnode,
+			master_child_pnode,
+			(void *)NULL,
+			pnode_mount_func,
+			NULL,
+			vfs_prepare_mount_func,
+			source_mnt,
+			mountpoint_dentry,
+			mnt);
+}
+
+int pnode_commit_mount_post_func(struct vfspnode *pnode, void *indata,
+		va_list args)
+{
+	if (va_arg(args, int)) {
+		spin_lock(&vfspnode_lock);
+		BUG_ON(!list_empty(&pnode->pnode_vfs));
+		BUG_ON(!list_empty(&pnode->pnode_slavevfs));
+		BUG_ON(!list_empty(&pnode->pnode_slavepnode));
+		list_del_init(&pnode->pnode_peer_slave);
+		put_pnode_locked(pnode);
+		spin_unlock(&vfspnode_lock);
+	}
+	return 0;
+}
+
+int vfs_commit_mount_func(struct vfsmount *mnt, enum pnode_vfs_type flag,
+		void *indata, va_list args)
+{
+	BUG_ON(mnt == mnt->mnt_parent);
+	do_attach_commit_mnt(mnt);
+	if (va_arg(args, int)) {
+		spin_lock(&vfspnode_lock);
+		list_del_init(&mnt->mnt_pnode_mntlist);
+		put_pnode_locked(mnt->mnt_pnode);
+		spin_unlock(&vfspnode_lock);
+		mnt->mnt_pnode = NULL;
+	}
+	return 0;
+}
+
+/*
+ * @pnode: walk the propogation tree and complete the
+ * 	attachments of the child mounts to the parents
+ * 	correspondingly.
+ * @flag: if set destroy the propogation tree
+ */
+int pnode_commit_mount(struct vfspnode *pnode, int flag)
+{
+	return  pnode_traverse(pnode,
+			NULL, (void *)NULL, NULL, pnode_commit_mount_post_func,
+			vfs_commit_mount_func, flag);
+}
+
+int vfs_abort_mount_func(struct vfsmount *mnt,
+		enum pnode_vfs_type flag, void *indata, va_list args)
+
+{
+	struct vfsmount *exception_mnt = va_arg(args, struct vfsmount *);
+	BUG_ON(!mnt->mnt_pnode);
+	pnode_disassociate_mnt(mnt);
+	do_detach_prepare_mnt(mnt, (exception_mnt != mnt));
+	return 0;
+}
+
+/*
+ * clean the propogation tree under pnode, releasing all
+ * the mounts, except exception_mnt
+ * @pnode: the pnode tree to be cleanup unlinking and
+ * 	releasing all pnodes in the tree as well as
+ * 	unlinking any mounts, except 'exception_mnt'
+ * @exception_mnt: the mnt to be unlinked from pnode
+ * 		bug not released.
+ */
+int pnode_abort_mount(struct vfspnode *pnode,
+		struct vfsmount *exception_mnt)
+{
+	return  pnode_traverse(pnode,
+			NULL, (void *)NULL, NULL, NULL,
+			vfs_abort_mount_func, exception_mnt);
+}
Index: 2.6.12.work2/include/linux/fs.h
===================================================================
--- 2.6.12.work2.orig/include/linux/fs.h
+++ 2.6.12.work2/include/linux/fs.h
@@ -1216,7 +1216,12 @@ extern struct vfsmount *kern_mount(struc
 extern int may_umount_tree(struct vfsmount *);
 extern int may_umount(struct vfsmount *);
 extern long do_mount(char *, char *, char *, unsigned long, void *);
+extern struct vfsmount *do_attach_prepare_mnt(struct vfsmount *,
+		struct dentry *, struct vfsmount *, int);
+extern void do_attach_commit_mnt(struct vfsmount *);
 extern struct vfsmount *do_make_mounted(struct vfsmount *, struct dentry *);
+extern int do_make_unmounted(struct vfsmount *);
+extern void do_detach_prepare_mnt(struct vfsmount *, int);
 
 extern int vfs_statfs(struct super_block *, struct kstatfs *);
 
Index: 2.6.12.work2/include/linux/namespace.h
===================================================================
--- 2.6.12.work2.orig/include/linux/namespace.h
+++ 2.6.12.work2/include/linux/namespace.h
@@ -9,7 +9,6 @@ struct namespace {
 	atomic_t		count;
 	struct vfsmount *	root;
 	struct list_head	list;
-	struct rw_semaphore	sem;
 };
 
 extern void umount_tree(struct vfsmount *);
Index: 2.6.12.work2/include/linux/dcache.h
===================================================================
--- 2.6.12.work2.orig/include/linux/dcache.h
+++ 2.6.12.work2/include/linux/dcache.h
@@ -329,6 +329,8 @@ static inline int d_mountpoint(struct de
 }
 
 extern struct vfsmount *lookup_mnt(struct vfsmount *, struct dentry *);
+extern struct vfsmount *__lookup_mnt(struct vfsmount *,
+		struct dentry *, struct dentry *);
 extern struct dentry *lookup_create(struct nameidata *nd, int is_dir);
 
 extern int sysctl_vfs_cache_pressure;

^ permalink raw reply	[flat|nested] 37+ messages in thread

* (unknown)
  2005-07-25 22:44 (unknown) Ram Pai
                   ` (2 preceding siblings ...)
  2005-07-25 22:44 ` (unknown) Ram Pai
@ 2005-07-25 22:44 ` Ram Pai
  2005-07-25 22:44 ` (unknown) Ram Pai
                   ` (5 subsequent siblings)
  9 siblings, 0 replies; 37+ messages in thread
From: Ram Pai @ 2005-07-25 22:44 UTC (permalink / raw)
  To: akpm, Al Viro; +Cc: Avantika Mathur, Mike Waychison

, miklos@szeredi.hu, Janak Desai <janak@us.ibm.com>, linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org
Subject: [PATCH 4/7] shared subtree
Content-Type: text/x-patch; name=move.patch
Content-Disposition: inline; filename=move.patch

Adds ability to move a shared/private/slave/unclone tree to any other
shared/private/slave/unclone tree. Also incorporates the same behavior
for pivot_root()

RP


Signed by Ram Pai (linuxram@us.ibm.com)

 fs/namespace.c        |  196 +++++++++++++++++++++++++++++++++++++++++++-------
 include/linux/mount.h |    2 
 2 files changed, 173 insertions(+), 25 deletions(-)

Index: 2.6.12.work2/fs/namespace.c
===================================================================
--- 2.6.12.work2.orig/fs/namespace.c
+++ 2.6.12.work2/fs/namespace.c
@@ -772,9 +772,12 @@ static void abort_attach_recursive_mnt(s
 	list_del_init(head);
 }
 
+
  /*
  *  @source_mnt : mount tree to be attached
  *  @nd		: place the mount tree @source_mnt is attached
+ *  @move	: use the move semantics if set, else use normal attach semantics
+ *                as explained below
  *
  *  NOTE: in the table below explains the semantics when a source vfsmount
  *  of a given type is attached to a destination vfsmount of a give type.
@@ -801,12 +804,41 @@ static void abort_attach_recursive_mnt(s
  *  |		|		|       	 |   	    |    	|
  *   ********************************************************************
  *
- * (++)  the mount will be propogated to all the vfsmounts in the pnode tree
+ * (++)  the mount is propogated to all the vfsmounts in the pnode tree
  *    	  of the destination vfsmount, and all the non-slave new mounts in
  *    	  destination vfsmount will be added the source vfsmount's pnode.
- * (+)  the mount will be propogated to the destination vfsmount
+ * (+)  the mount is propogated to the destination vfsmount
  *    	  and the new mount will be added to the source vfsmount's pnode.
  *
+ *  ---------------------------------------------------------------------
+ *  |				MOVE MOUNT OPERATION			|
+ *  |*******************************************************************|
+ *  |  dest --> | shared	|	private	 |  slave   |unclonable	|
+ *  | source	|		|       	 |   	    |    	|
+ *  |   |   	|		|       	 |   	    |    	|
+ *  |   v 	|		|       	 |   	    |    	|
+ *  |*******************************************************************|
+ *  |	     	|		|       	 |   	    |    	|
+ *  |  shared	| shared (++) 	|      shared (+)|shared (+)| shared (+)|
+ *  |		|		|       	 |   	    |    	|
+ *  |		|		|       	 |   	    |    	|
+ *  | private	| shared (+)	|      private	 | private  | private  	|
+ *  |		|		|       	 |   	    |    	|
+ *  |		|		|       	 |   	    |    	|
+ *  | slave	| shared (+++)	|      slave     | slave    | slave  	|
+ *  |		|		|       	 |   	    |    	|
+ *  |		|		|       	 |   	    |    	|
+ *  | unclonable|  invalid	|     unclonable |unclonable| unclonable|
+ *  |		|		|       	 |   	    |    	|
+ *  |		|		|       	 |   	    |    	|
+ *   ********************************************************************
+ *
+ * (+++)  the mount is propogated to all the vfsmounts in the pnode tree
+ *    	  of the destination vfsmount, and all the new mounts is
+ *    	  added to a new pnode , which is a slave pnode of the
+ *    	  source vfsmount's pnode.
+ *
+ *
  * if the source mount is a tree, the operations explained above is
  * applied to each vfsmount in the tree.
  *
@@ -815,7 +847,7 @@ static void abort_attach_recursive_mnt(s
  *
   */
 static int attach_recursive_mnt(struct vfsmount *source_mnt,
-		struct nameidata *nd)
+		struct nameidata *nd, int move)
 {
 	struct vfsmount *mntpt_mnt, *last, *m, *p;
 	struct vfspnode *src_pnode, *dest_pnode, *tmp_pnode;
@@ -849,8 +881,8 @@ static int attach_recursive_mnt(struct v
 	list_add_tail(&mnt_list_head, &source_mnt->mnt_list);
 
 	for (m = source_mnt; m; m = next_mnt(m, source_mnt)) {
-
-		BUG_ON(IS_MNT_UNCLONE(m));
+		int unclone = IS_MNT_UNCLONE(m);
+		int slave = IS_MNT_SLAVE(m);
 
 		while (p && p != m->mnt_parent)
 			p = p->mnt_parent;
@@ -866,7 +898,7 @@ static int attach_recursive_mnt(struct v
 
 		dest_pnode = IS_MNT_SHARED(mntpt_mnt) ?
 			mntpt_mnt->mnt_pnode : NULL;
-		src_pnode = (IS_MNT_SHARED(m))?
+		src_pnode = (IS_MNT_SHARED(m) || (move && slave))?
 				m->mnt_pnode : NULL;
 
 		/*
@@ -882,7 +914,7 @@ static int attach_recursive_mnt(struct v
 		list_del_init(&m->mnt_list);
 		list_add_tail(&tmp_pnode->pnode_peer_slave, &pnodehead);
 
-		if (dest_pnode) {
+		if (dest_pnode && !unclone) {
 			if ((ret = pnode_prepare_mount(dest_pnode, tmp_pnode,
 					mntpt_dentry, m, mntpt_mnt))) {
 				tmp_pnode->pnode_master = src_pnode;
@@ -890,23 +922,33 @@ static int attach_recursive_mnt(struct v
 				last = m;
 				goto error;
 			}
+			if (move && dest_pnode && slave)
+ 				SET_PNODE_SLAVE(tmp_pnode);
 		} else {
 			if (m == m->mnt_parent)
 				do_attach_prepare_mnt(mntpt_mnt,
 					mntpt_dentry, m, 0);
-			pnode_add_member_mnt(tmp_pnode, m);
-			if (!src_pnode) {
-				set_mnt_private(m);
+			if (move && slave)
+				pnode_add_slave_mnt(tmp_pnode, m);
+			else {
+				pnode_add_member_mnt(tmp_pnode, m);
+				if (unclone) {
+					BUG_ON(!move);
+					set_mnt_unclone(m);
+					m->mnt_pnode = tmp_pnode;
+					SET_PNODE_DELETE(tmp_pnode);
+				} else if (!src_pnode) {
+					set_mnt_private(m);
+					m->mnt_pnode = tmp_pnode;
+					SET_PNODE_DELETE(tmp_pnode);
+				}
 				/*
-				 * NOTE: set_mnt_private()
-				 * resets m->mnt_pnode.
-				 * Reinitialize it. This is needed to
-				 * decrement the refcount on the
-				 * pnode when the mount 'm' is
-				 * unlinked in pnode_commit_mount().
+				 * NOTE: set_mnt_private() & set_mnt_unclone()
+				 * resets m->mnt_pnode. Hence reinitialize it.
+				 * We need this to decrement the refcount
+				 * on the pnode when the mount 'm' is
+				 * unlinked in pnode_commit_mount()
 				 */
-				m->mnt_pnode = tmp_pnode;
-				SET_PNODE_DELETE(tmp_pnode);
 			}
 		}
 
@@ -931,6 +973,46 @@ error:
 	return 1;
 }
 
+static void
+detach_recursive_mnt(struct vfsmount *source_mnt, struct nameidata *nd)
+{
+	struct vfsmount *m;
+
+	detach_mnt(source_mnt, nd);
+	spin_lock(&vfspnode_lock);
+	for (m = source_mnt; m; m = next_mnt(m, source_mnt)) {
+		list_del_init(&m->mnt_pnode_mntlist);
+		list_del_init(&m->mnt_list);
+		if (m != source_mnt)
+			list_add_tail(&m->mnt_list, &source_mnt->mnt_list);
+	}
+	spin_unlock(&vfspnode_lock);
+}
+
+static void
+undo_detach_recursive_mnt(struct vfsmount *mnt, struct nameidata *nd)
+{
+	struct vfsmount *m;
+	LIST_HEAD(head);
+
+	spin_lock(&vfspnode_lock);
+	for (m = mnt; m; m = next_mnt(m, mnt)) {
+		if (m->mnt_pnode) {
+			if (IS_MNT_SHARED(m))
+				list_add(&m->mnt_pnode_mntlist,
+					&m->mnt_pnode->pnode_vfs);
+			if (IS_MNT_SLAVE(m))
+				list_add(&m->mnt_pnode_mntlist,
+					&m->mnt_pnode->pnode_slavevfs);
+		}
+	}
+	attach_mnt(mnt, nd);
+	spin_unlock(&vfspnode_lock);
+
+	list_add_tail(&head, &mnt->mnt_list);
+	list_splice(&head, nd->mnt->mnt_namespace->list.prev);
+}
+
 static int graft_tree(struct vfsmount *mnt, struct nameidata *nd)
 {
 	int err, ret;
@@ -957,7 +1039,7 @@ static int graft_tree(struct vfsmount *m
 	ret = (IS_ROOT(nd->dentry) || !d_unhashed(nd->dentry));
 	spin_unlock(&vfsmount_lock);
 	if (ret)
-		err = attach_recursive_mnt(mnt, nd);
+		err = attach_recursive_mnt(mnt, nd, 0);
 out_unlock:
 	up(&nd->dentry->d_inode->i_sem);
 	if (!err)
@@ -1311,6 +1393,19 @@ static int do_remount(struct nameidata *
 	return err;
 }
 
+/*
+ * return 1 if the mount tree contains a unclonable mount
+ */
+static inline int tree_contains_unclone(struct vfsmount *mnt)
+{
+	struct vfsmount *p;
+	for (p = mnt; p; p = next_mnt(p, mnt)) {
+		if (IS_MNT_UNCLONE(p))
+			return 1;
+	}
+	return 0;
+}
+
 static int do_move_mount(struct nameidata *nd, char *old_name)
 {
 	struct nameidata old_nd, parent_nd;
@@ -1351,14 +1446,35 @@ static int do_move_mount(struct nameidat
 	      S_ISDIR(old_nd.dentry->d_inode->i_mode))
 		goto out2;
 
+	/*
+	 * Don't move a mount in a shared parent.
+	 */
+	if (old_nd.mnt->mnt_parent &&
+		IS_MNT_SHARED(old_nd.mnt->mnt_parent))
+		goto out2;
+
+	/*
+	 * Don't move a mount tree having unclonable
+	 * mounts, under a shared mount
+	 */
+	if (IS_MNT_SHARED(nd->mnt) &&
+		tree_contains_unclone(old_nd.mnt))
+		goto out2;
+
 	err = -ELOOP;
 	for (p = nd->mnt; p->mnt_parent!=p; p = p->mnt_parent)
 		if (p == old_nd.mnt)
 			goto out2;
 	err = 0;
 
-	detach_mnt(old_nd.mnt, &parent_nd);
-	attach_mnt(old_nd.mnt, nd);
+	detach_recursive_mnt(old_nd.mnt, &parent_nd);
+	spin_unlock(&vfsmount_lock);
+	if ((err = attach_recursive_mnt(old_nd.mnt, nd, 1))) {
+		undo_detach_recursive_mnt(old_nd.mnt, &parent_nd);
+		goto out1;
+	}
+	spin_lock(&vfsmount_lock);
+	mntput(old_nd.mnt);
 
 	/* if the mount is moved, it should no longer be expire
 	 * automatically */
@@ -1949,6 +2065,16 @@ asmlinkage long sys_pivot_root(const cha
 		goto out2; /* not a mountpoint */
 	if (new_nd.mnt->mnt_root != new_nd.dentry)
 		goto out2; /* not a mountpoint */
+	/*
+	 * Don't move a mount in a shared parent.
+	 */
+	if(user_nd.mnt->mnt_parent &&
+		IS_MNT_SHARED(user_nd.mnt->mnt_parent))
+		goto out2;
+	if(new_nd.mnt->mnt_parent &&
+		IS_MNT_SHARED(new_nd.mnt->mnt_parent))
+		goto out2;
+
 	tmp = old_nd.mnt; /* make sure we can reach put_old from new_root */
 	spin_lock(&vfsmount_lock);
 	if (tmp != new_nd.mnt) {
@@ -1963,10 +2089,30 @@ asmlinkage long sys_pivot_root(const cha
 			goto out3;
 	} else if (!is_subdir(old_nd.dentry, new_nd.dentry))
 		goto out3;
-	detach_mnt(new_nd.mnt, &parent_nd);
-	detach_mnt(user_nd.mnt, &root_parent);
-	attach_mnt(user_nd.mnt, &old_nd);     /* mount old root on put_old */
-	attach_mnt(new_nd.mnt, &root_parent); /* mount new_root on / */
+
+	detach_recursive_mnt(user_nd.mnt, &root_parent);
+	detach_recursive_mnt(new_nd.mnt, &parent_nd);
+
+	spin_unlock(&vfsmount_lock);
+ 	if ((error = attach_recursive_mnt(user_nd.mnt, &old_nd, 1))) {
+		spin_lock(&vfsmount_lock);
+		undo_detach_recursive_mnt(new_nd.mnt, &parent_nd);
+		undo_detach_recursive_mnt(user_nd.mnt, &root_parent);
+		goto out3;
+	}
+	spin_lock(&vfsmount_lock);
+ 	mntput(user_nd.mnt);
+
+	spin_unlock(&vfsmount_lock);
+ 	if ((error = attach_recursive_mnt(new_nd.mnt, &root_parent, 1))) {
+		spin_lock(&vfsmount_lock);
+		undo_detach_recursive_mnt(new_nd.mnt, &parent_nd);
+		undo_detach_recursive_mnt(user_nd.mnt, &root_parent);
+		goto out3;
+	}
+	spin_lock(&vfsmount_lock);
+ 	mntput(new_nd.mnt);
+
 	spin_unlock(&vfsmount_lock);
 	chroot_fs_refs(&user_nd, &new_nd);
 	security_sb_post_pivotroot(&user_nd, &new_nd);
Index: 2.6.12.work2/include/linux/mount.h
===================================================================
--- 2.6.12.work2.orig/include/linux/mount.h
+++ 2.6.12.work2/include/linux/mount.h
@@ -29,6 +29,8 @@
 #define IS_MNT_SLAVE(mnt) (mnt->mnt_flags & MNT_SLAVE)
 #define IS_MNT_PRIVATE(mnt) (mnt->mnt_flags & MNT_PRIVATE)
 #define IS_MNT_UNCLONE(mnt) (mnt->mnt_flags & MNT_UNCLONE)
+#define GET_MNT_TYPE(mnt) (mnt->mnt_flags & MNT_PNODE_MASK)
+#define SET_MNT_TYPE(mnt, type) (mnt->mnt_flags |= (type & MNT_PNODE_MASK))
 
 #define CLEAR_MNT_SHARED(mnt) (mnt->mnt_flags &= ~(MNT_PNODE_MASK & MNT_SHARED))
 #define CLEAR_MNT_PRIVATE(mnt) (mnt->mnt_flags &= ~(MNT_PNODE_MASK & MNT_PRIVATE))

^ permalink raw reply	[flat|nested] 37+ messages in thread

* (unknown)
  2005-07-25 22:44 (unknown) Ram Pai
                   ` (3 preceding siblings ...)
  2005-07-25 22:44 ` (unknown) Ram Pai
@ 2005-07-25 22:44 ` Ram Pai
  2005-07-25 22:44 ` (unknown) Ram Pai
                   ` (4 subsequent siblings)
  9 siblings, 0 replies; 37+ messages in thread
From: Ram Pai @ 2005-07-25 22:44 UTC (permalink / raw)
  To: akpm, Al Viro; +Cc: Avantika Mathur, Mike Waychison

, miklos@szeredi.hu, Janak Desai <janak@us.ibm.com>, linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org
Subject: [PATCH 5/7] shared subtree
Content-Type: text/x-patch; name=umount.patch
Content-Disposition: inline; filename=umount.patch

Adds ability to unmount a shared/slave/unclone/private tree

RP

Signed by Ram Pai (linuxram@us.ibm.com)

 fs/namespace.c        |   76 ++++++++++++++++++++++++++++++++++++++++----------
 fs/pnode.c            |   66 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/fs.h    |    3 +
 include/linux/pnode.h |    9 ++++-
 4 files changed, 138 insertions(+), 16 deletions(-)

Index: 2.6.12.work2/fs/pnode.c
===================================================================
--- 2.6.12.work2.orig/fs/pnode.c
+++ 2.6.12.work2/fs/pnode.c
@@ -666,3 +666,69 @@ int pnode_abort_mount(struct vfspnode *p
 			NULL, (void *)NULL, NULL, NULL,
 			vfs_abort_mount_func, exception_mnt);
 }
+
+static int vfs_busy(struct vfsmount *mnt, enum pnode_vfs_type flag,
+		void *indata, va_list args)
+{
+	struct dentry *dentry = va_arg(args, struct dentry *);
+	struct dentry *rootdentry = va_arg(args, struct dentry *);
+	struct vfsmount *origmnt = va_arg(args, struct vfsmount *);
+	struct vfsmount *child_mnt;
+	int ret=0;
+
+	spin_unlock(&vfsmount_lock);
+	child_mnt = __lookup_mnt(mnt, dentry, rootdentry);
+	spin_lock(&vfsmount_lock);
+
+	if (!child_mnt)
+		return 0;
+
+	if (list_empty(&child_mnt->mnt_mounts)) {
+		if (origmnt == child_mnt)
+			ret = do_refcount_check(child_mnt, 3);
+		else
+			ret = do_refcount_check(child_mnt, 2);
+	}
+	mntput(child_mnt);
+	return ret;
+}
+
+int pnode_mount_busy(struct vfspnode *pnode, struct dentry *mntpt,
+		struct dentry *root, struct vfsmount *mnt)
+{
+	return pnode_traverse(pnode, NULL, NULL,
+			NULL, NULL, vfs_busy, mntpt, root, mnt);
+}
+
+
+int vfs_umount(struct vfsmount *mnt, enum pnode_vfs_type flag,
+		void *indata, va_list args)
+{
+	struct vfsmount *child_mnt;
+	struct dentry *dentry, *rootdentry;
+
+
+	dentry = va_arg(args, struct dentry *);
+	rootdentry = va_arg(args, struct dentry *);
+
+	spin_unlock(&vfsmount_lock);
+	child_mnt = __lookup_mnt(mnt, dentry, rootdentry);
+	spin_lock(&vfsmount_lock);
+	mntput(child_mnt);
+	if (child_mnt && list_empty(&child_mnt->mnt_mounts)) {
+		if (IS_MNT_SHARED(child_mnt) ||
+				IS_MNT_SLAVE(child_mnt)) {
+			BUG_ON(!child_mnt->mnt_pnode);
+			pnode_disassociate_mnt(child_mnt);
+		}
+		do_detach_mount(child_mnt);
+	}
+	return 0;
+}
+
+int pnode_umount(struct vfspnode *pnode, struct dentry *dentry,
+			struct dentry *rootdentry)
+{
+	return pnode_traverse(pnode, NULL, (void *)NULL,
+			NULL, NULL, vfs_umount, dentry, rootdentry);
+}
Index: 2.6.12.work2/fs/namespace.c
===================================================================
--- 2.6.12.work2.orig/fs/namespace.c
+++ 2.6.12.work2/fs/namespace.c
@@ -395,6 +395,20 @@ resume:
 
 EXPORT_SYMBOL(may_umount_tree);
 
+int mount_busy(struct vfsmount *mnt)
+{
+	struct vfspnode *parent_pnode;
+
+	if (mnt == mnt->mnt_parent || !IS_MNT_SHARED(mnt->mnt_parent))
+		return do_refcount_check(mnt, 2);
+
+	parent_pnode = mnt->mnt_parent->mnt_pnode;
+	BUG_ON(!parent_pnode);
+	return pnode_mount_busy(parent_pnode,
+			mnt->mnt_mountpoint,
+			mnt->mnt_root, mnt);
+}
+
 /**
  * may_umount - check if a mount point is busy
  * @mnt: root of mount
@@ -410,14 +424,28 @@ EXPORT_SYMBOL(may_umount_tree);
  */
 int may_umount(struct vfsmount *mnt)
 {
-	if (atomic_read(&mnt->mnt_count) > 2)
+	if (mount_busy(mnt))
 		return -EBUSY;
 	return 0;
 }
 
 EXPORT_SYMBOL(may_umount);
 
-void umount_tree(struct vfsmount *mnt)
+void do_detach_mount(struct vfsmount *mnt)
+{
+	struct nameidata old_nd;
+	if (mnt != mnt->mnt_parent) {
+		detach_mnt(mnt, &old_nd);
+		path_release(&old_nd);
+	}
+	list_del_init(&mnt->mnt_list);
+	list_del_init(&mnt->mnt_fslink);
+	spin_unlock(&vfsmount_lock);
+	mntput(mnt);
+	spin_lock(&vfsmount_lock);
+}
+
+void __umount_tree(struct vfsmount *mnt, int propogate)
 {
 	struct vfsmount *p;
 	LIST_HEAD(kill);
@@ -431,20 +459,40 @@ void umount_tree(struct vfsmount *mnt)
 		mnt = list_entry(kill.next, struct vfsmount, mnt_list);
 		list_del_init(&mnt->mnt_list);
 		list_del_init(&mnt->mnt_fslink);
-		if (mnt->mnt_parent == mnt) {
-			spin_unlock(&vfsmount_lock);
+		if (propogate && mnt->mnt_parent != mnt &&
+			IS_MNT_SHARED(mnt->mnt_parent)) {
+			struct vfspnode *parent_pnode
+				= mnt->mnt_parent->mnt_pnode;
+			BUG_ON(!parent_pnode);
+			pnode_umount(parent_pnode,
+				mnt->mnt_mountpoint,
+				mnt->mnt_root);
 		} else {
-			struct nameidata old_nd;
-			detach_mnt(mnt, &old_nd);
-			spin_unlock(&vfsmount_lock);
-			path_release(&old_nd);
+			if (IS_MNT_SHARED(mnt) || IS_MNT_SLAVE(mnt)) {
+				BUG_ON(!mnt->mnt_pnode);
+				pnode_disassociate_mnt(mnt);
+			}
+			do_detach_mount(mnt);
 		}
-		mntput(mnt);
-		spin_lock(&vfsmount_lock);
 	}
 }
 
-static int do_umount(struct vfsmount *mnt, int flags)
+void umount_tree(struct vfsmount *mnt)
+{
+	__umount_tree(mnt, 1);
+}
+
+/*
+ * return true if the refcount is greater than count
+ */
+int do_refcount_check(struct vfsmount *mnt, int count)
+{
+
+	int mycount = atomic_read(&mnt->mnt_count);
+	return (mycount > count);
+}
+
+int do_umount(struct vfsmount *mnt, int flags)
 {
 	struct super_block * sb = mnt->mnt_sb;
 	int retval;
@@ -525,7 +573,7 @@ static int do_umount(struct vfsmount *mn
 		spin_lock(&vfsmount_lock);
 	}
 	retval = -EBUSY;
-	if (atomic_read(&mnt->mnt_count) == 2 || flags & MNT_DETACH) {
+	if (flags & MNT_DETACH || !mount_busy(mnt)) {
 		if (!list_empty(&mnt->mnt_list))
 			umount_tree(mnt);
 		retval = 0;
@@ -659,7 +707,7 @@ static struct vfsmount *copy_tree(struct
  Enomem:
 	if (res) {
 		spin_lock(&vfsmount_lock);
-		umount_tree(res);
+		__umount_tree(res, 0);
 		spin_unlock(&vfsmount_lock);
 	}
 	return NULL;
@@ -1341,7 +1389,7 @@ static int do_loopback(struct nameidata 
 		err = graft_tree(mnt, nd);
 		if (err) {
  			spin_lock(&vfsmount_lock);
- 			umount_tree(mnt);
+ 			__umount_tree(mnt, 0);
  			spin_unlock(&vfsmount_lock);
 			/*
 			 * ok we failed! so undo any overlay
Index: 2.6.12.work2/include/linux/fs.h
===================================================================
--- 2.6.12.work2.orig/include/linux/fs.h
+++ 2.6.12.work2/include/linux/fs.h
@@ -1216,12 +1216,15 @@ extern struct vfsmount *kern_mount(struc
 extern int may_umount_tree(struct vfsmount *);
 extern int may_umount(struct vfsmount *);
 extern long do_mount(char *, char *, char *, unsigned long, void *);
+extern int do_umount(struct vfsmount *, int);
 extern struct vfsmount *do_attach_prepare_mnt(struct vfsmount *,
 		struct dentry *, struct vfsmount *, int);
 extern void do_attach_commit_mnt(struct vfsmount *);
 extern struct vfsmount *do_make_mounted(struct vfsmount *, struct dentry *);
 extern int do_make_unmounted(struct vfsmount *);
 extern void do_detach_prepare_mnt(struct vfsmount *, int);
+extern void do_detach_mount(struct vfsmount *);
+extern int do_refcount_check(struct vfsmount *, int );
 
 extern int vfs_statfs(struct super_block *, struct kstatfs *);
 
Index: 2.6.12.work2/include/linux/pnode.h
===================================================================
--- 2.6.12.work2.orig/include/linux/pnode.h
+++ 2.6.12.work2/include/linux/pnode.h
@@ -63,13 +63,15 @@ put_pnode_locked(struct vfspnode *pnode)
 {
 	if (!pnode)
 		return;
-	if (atomic_dec_and_test(&pnode->pnode_count)) {
+	if (atomic_dec_and_test(&pnode->pnode_count))
 		__put_pnode(pnode);
-	}
 }
 
 void __init pnode_init(unsigned long );
 struct vfspnode * pnode_alloc(void);
+void pnode_free(struct vfspnode *);
+int pnode_is_busy(struct vfspnode *);
+int pnode_umount_vfs(struct vfspnode *, struct dentry *, struct dentry *, int);
 void pnode_add_slave_mnt(struct vfspnode *, struct vfsmount *);
 void pnode_add_member_mnt(struct vfspnode *, struct vfsmount *);
 void pnode_del_slave_mnt(struct vfsmount *);
@@ -87,4 +89,7 @@ int pnode_prepare_mount(struct vfspnode 
 		struct vfsmount *, struct vfsmount *);
 int pnode_commit_mount(struct vfspnode *, int);
 int pnode_abort_mount(struct vfspnode *, struct vfsmount *);
+int pnode_umount(struct vfspnode *, struct dentry *, struct dentry *);
+int pnode_mount_busy(struct vfspnode *, struct dentry *, struct dentry *,
+		struct vfsmount *);
 #endif /* _LINUX_PNODE_H */

^ permalink raw reply	[flat|nested] 37+ messages in thread

* (unknown)
  2005-07-25 22:44 (unknown) Ram Pai
                   ` (4 preceding siblings ...)
  2005-07-25 22:44 ` (unknown) Ram Pai
@ 2005-07-25 22:44 ` Ram Pai
  2005-07-25 22:44 ` (unknown) Ram Pai
                   ` (3 subsequent siblings)
  9 siblings, 0 replies; 37+ messages in thread
From: Ram Pai @ 2005-07-25 22:44 UTC (permalink / raw)
  To: akpm, Al Viro; +Cc: Avantika Mathur, Mike Waychison

, miklos@szeredi.hu, Janak Desai <janak@us.ibm.com>, linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org
Subject: [PATCH 6/7] shared subtree
Content-Type: text/x-patch; name=namespace.patch
Content-Disposition: inline; filename=namespace.patch

Adds ability to clone a namespace that has shared/private/slave/unclone
subtrees in it.

RP


Signed by Ram Pai (linuxram@us.ibm.com)

 fs/namespace.c |    9 +++++++++
 1 files changed, 9 insertions(+)

Index: 2.6.12-rc6.work1/fs/namespace.c
===================================================================
--- 2.6.12-rc6.work1.orig/fs/namespace.c
+++ 2.6.12-rc6.work1/fs/namespace.c
@@ -1894,6 +1894,13 @@ int copy_namespace(int flags, struct tas
 	q = new_ns->root;
 	while (p) {
 		q->mnt_namespace = new_ns;
+
+		if (IS_MNT_SHARED(q))
+			pnode_add_member_mnt(q->mnt_pnode, q);
+		else if (IS_MNT_SLAVE(q))
+			pnode_add_slave_mnt(q->mnt_pnode, q);
+		put_pnode(q->mnt_pnode);
+
 		if (fs) {
 			if (p == fs->rootmnt) {
 				rootmnt = p;
@@ -2271,6 +2278,8 @@ void __put_namespace(struct namespace *n
 	spin_lock(&vfsmount_lock);
 
 	list_for_each_entry(mnt, &namespace->list, mnt_list) {
+		if (mnt->mnt_pnode)
+			pnode_disassociate_mnt(mnt);
 		mnt->mnt_namespace = NULL;
 	}
 

^ permalink raw reply	[flat|nested] 37+ messages in thread

* (unknown)
  2005-07-25 22:44 (unknown) Ram Pai
                   ` (5 preceding siblings ...)
  2005-07-25 22:44 ` (unknown) Ram Pai
@ 2005-07-25 22:44 ` Ram Pai
  2005-07-26  2:53 ` supposed to be shared subtree patches Ram Pai
                   ` (2 subsequent siblings)
  9 siblings, 0 replies; 37+ messages in thread
From: Ram Pai @ 2005-07-25 22:44 UTC (permalink / raw)
  To: akpm, Al Viro; +Cc: Avantika Mathur, Mike Waychison

, miklos@szeredi.hu, Janak Desai <janak@us.ibm.com>, linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org
Subject: [PATCH 7/7] shared subtree
Content-Type: text/x-patch; name=automount.patch
Content-Disposition: inline; filename=automount.patch

adds support for mount/umount propogation for autofs initiated operations,
RP

Signed by Ram Pai (linuxram@us.ibm.com)

 fs/namespace.c        |  176 +++++++++++++++++++-------------------------------
 fs/pnode.c            |   12 +--
 include/linux/pnode.h |    3 
 3 files changed, 76 insertions(+), 115 deletions(-)

Index: 2.6.12.work2/fs/namespace.c
===================================================================
--- 2.6.12.work2.orig/fs/namespace.c
+++ 2.6.12.work2/fs/namespace.c
@@ -202,6 +202,9 @@ struct vfsmount *do_attach_prepare_mnt(s
 		if(!(child_mnt = clone_mnt(template_mnt,
 				template_mnt->mnt_root)))
 			return NULL;
+		spin_lock(&vfsmount_lock);
+		list_del_init(&child_mnt->mnt_fslink);
+		spin_unlock(&vfsmount_lock);
 	} else
 		child_mnt = template_mnt;
 
@@ -355,35 +358,14 @@ struct seq_operations mounts_op = {
  */
 int may_umount_tree(struct vfsmount *mnt)
 {
-	struct list_head *next;
-	struct vfsmount *this_parent = mnt;
-	int actual_refs;
-	int minimum_refs;
+	int actual_refs=0;
+	int minimum_refs=0;
+	struct vfsmount *p;
 
 	spin_lock(&vfsmount_lock);
-	actual_refs = atomic_read(&mnt->mnt_count);
-	minimum_refs = 2;
-repeat:
-	next = this_parent->mnt_mounts.next;
-resume:
-	while (next != &this_parent->mnt_mounts) {
-		struct vfsmount *p = list_entry(next, struct vfsmount, mnt_child);
-
-		next = next->next;
-
+	for (p = mnt; p; p = next_mnt(p, mnt)) {
 		actual_refs += atomic_read(&p->mnt_count);
 		minimum_refs += 2;
-
-		if (!list_empty(&p->mnt_mounts)) {
-			this_parent = p;
-			goto repeat;
-		}
-	}
-
-	if (this_parent != mnt) {
-		next = this_parent->mnt_child.next;
-		this_parent = this_parent->mnt_parent;
-		goto resume;
 	}
 	spin_unlock(&vfsmount_lock);
 
@@ -395,18 +377,18 @@ resume:
 
 EXPORT_SYMBOL(may_umount_tree);
 
-int mount_busy(struct vfsmount *mnt)
+int mount_busy(struct vfsmount *mnt, int refcnt)
 {
 	struct vfspnode *parent_pnode;
 
 	if (mnt == mnt->mnt_parent || !IS_MNT_SHARED(mnt->mnt_parent))
-		return do_refcount_check(mnt, 2);
+		return do_refcount_check(mnt, refcnt);
 
 	parent_pnode = mnt->mnt_parent->mnt_pnode;
 	BUG_ON(!parent_pnode);
 	return pnode_mount_busy(parent_pnode,
 			mnt->mnt_mountpoint,
-			mnt->mnt_root, mnt);
+			mnt->mnt_root, mnt, refcnt);
 }
 
 /**
@@ -424,9 +406,12 @@ int mount_busy(struct vfsmount *mnt)
  */
 int may_umount(struct vfsmount *mnt)
 {
-	if (mount_busy(mnt))
-		return -EBUSY;
-	return 0;
+	int ret=0;
+	spin_lock(&vfsmount_lock);
+	if (mount_busy(mnt, 2))
+		ret = -EBUSY;
+	spin_unlock(&vfsmount_lock);
+	return ret;
 }
 
 EXPORT_SYMBOL(may_umount);
@@ -445,7 +430,26 @@ void do_detach_mount(struct vfsmount *mn
 	spin_lock(&vfsmount_lock);
 }
 
-void __umount_tree(struct vfsmount *mnt, int propogate)
+void umount_mnt(struct vfsmount *mnt, int propogate)
+{
+	if (propogate && mnt->mnt_parent != mnt &&
+		IS_MNT_SHARED(mnt->mnt_parent)) {
+		struct vfspnode *parent_pnode
+			= mnt->mnt_parent->mnt_pnode;
+		BUG_ON(!parent_pnode);
+		pnode_umount(parent_pnode,
+			mnt->mnt_mountpoint,
+			mnt->mnt_root);
+	} else {
+		if (IS_MNT_SHARED(mnt) || IS_MNT_SLAVE(mnt)) {
+			BUG_ON(!mnt->mnt_pnode);
+			pnode_disassociate_mnt(mnt);
+		}
+		do_detach_mount(mnt);
+	}
+}
+
+static void __umount_tree(struct vfsmount *mnt, int propogate)
 {
 	struct vfsmount *p;
 	LIST_HEAD(kill);
@@ -459,21 +463,7 @@ void __umount_tree(struct vfsmount *mnt,
 		mnt = list_entry(kill.next, struct vfsmount, mnt_list);
 		list_del_init(&mnt->mnt_list);
 		list_del_init(&mnt->mnt_fslink);
-		if (propogate && mnt->mnt_parent != mnt &&
-			IS_MNT_SHARED(mnt->mnt_parent)) {
-			struct vfspnode *parent_pnode
-				= mnt->mnt_parent->mnt_pnode;
-			BUG_ON(!parent_pnode);
-			pnode_umount(parent_pnode,
-				mnt->mnt_mountpoint,
-				mnt->mnt_root);
-		} else {
-			if (IS_MNT_SHARED(mnt) || IS_MNT_SLAVE(mnt)) {
-				BUG_ON(!mnt->mnt_pnode);
-				pnode_disassociate_mnt(mnt);
-			}
-			do_detach_mount(mnt);
-		}
+		umount_mnt(mnt, propogate);
 	}
 }
 
@@ -573,7 +563,7 @@ int do_umount(struct vfsmount *mnt, int 
 		spin_lock(&vfsmount_lock);
 	}
 	retval = -EBUSY;
-	if (flags & MNT_DETACH || !mount_busy(mnt)) {
+	if (flags & MNT_DETACH || !mount_busy(mnt, 2)) {
 		if (!list_empty(&mnt->mnt_list))
 			umount_tree(mnt);
 		retval = 0;
@@ -755,8 +745,11 @@ static void commit_attach_recursive_mnt(
 
 			if (slave_flag)
 				pnode_add_slave_pnode(master_pnode, tmp_pnode);
-			else
+			else {
+				spin_lock(&vfspnode_lock);
 				pnode_merge_pnode(tmp_pnode, master_pnode);
+				spin_unlock(&vfspnode_lock);
+			}
 
 			/*
 			 * we don't need the extra reference to
@@ -820,7 +813,6 @@ static void abort_attach_recursive_mnt(s
 	list_del_init(head);
 }
 
-
  /*
  *  @source_mnt : mount tree to be attached
  *  @nd		: place the mount tree @source_mnt is attached
@@ -1518,8 +1510,9 @@ static int do_move_mount(struct nameidat
 	detach_recursive_mnt(old_nd.mnt, &parent_nd);
 	spin_unlock(&vfsmount_lock);
 	if ((err = attach_recursive_mnt(old_nd.mnt, nd, 1))) {
+		spin_lock(&vfsmount_lock);
 		undo_detach_recursive_mnt(old_nd.mnt, &parent_nd);
-		goto out1;
+		goto out2;
 	}
 	spin_lock(&vfsmount_lock);
 	mntput(old_nd.mnt);
@@ -1621,6 +1614,8 @@ void mark_mounts_for_expiry(struct list_
 	if (list_empty(mounts))
 		return;
 
+	down_write(&namespace_sem);
+
 	spin_lock(&vfsmount_lock);
 
 	/* extract from the expiration list every vfsmount that matches the
@@ -1630,8 +1625,7 @@ void mark_mounts_for_expiry(struct list_
 	 *   cleared by mntput())
 	 */
 	list_for_each_entry_safe(mnt, next, mounts, mnt_fslink) {
-		if (!xchg(&mnt->mnt_expiry_mark, 1) ||
-		    atomic_read(&mnt->mnt_count) != 1)
+		if (!xchg(&mnt->mnt_expiry_mark, 1) || mount_busy(mnt, 1))
 			continue;
 
 		mntget(mnt);
@@ -1639,12 +1633,13 @@ void mark_mounts_for_expiry(struct list_
 	}
 
 	/*
-	 * go through the vfsmounts we've just consigned to the graveyard to
-	 * - check that they're still dead
+	 * go through the vfsmounts we've just consigned to the graveyard
 	 * - delete the vfsmount from the appropriate namespace under lock
 	 * - dispose of the corpse
 	 */
 	while (!list_empty(&graveyard)) {
+		struct super_block *sb;
+
 		mnt = list_entry(graveyard.next, struct vfsmount, mnt_fslink);
 		list_del_init(&mnt->mnt_fslink);
 
@@ -1655,60 +1650,25 @@ void mark_mounts_for_expiry(struct list_
 			continue;
 		get_namespace(namespace);
 
-		spin_unlock(&vfsmount_lock);
-		down_write(&namespace_sem);
-		spin_lock(&vfsmount_lock);
-
-		/* check that it is still dead: the count should now be 2 - as
-		 * contributed by the vfsmount parent and the mntget above */
-		if (atomic_read(&mnt->mnt_count) == 2) {
-			struct vfsmount *xdmnt;
-			struct dentry *xdentry;
-
-			/* delete from the namespace */
-			list_del_init(&mnt->mnt_list);
-			list_del_init(&mnt->mnt_child);
-			list_del_init(&mnt->mnt_hash);
-			mnt->mnt_mountpoint->d_mounted--;
-
-			xdentry = mnt->mnt_mountpoint;
-			mnt->mnt_mountpoint = mnt->mnt_root;
-			xdmnt = mnt->mnt_parent;
-			mnt->mnt_parent = mnt;
-
-			spin_unlock(&vfsmount_lock);
-
-			mntput(xdmnt);
-			dput(xdentry);
-
-			/* now lay it to rest if this was the last ref on the
-			 * superblock */
-			if (atomic_read(&mnt->mnt_sb->s_active) == 1) {
-				/* last instance - try to be smart */
-				lock_kernel();
-				DQUOT_OFF(mnt->mnt_sb);
-				acct_auto_close(mnt->mnt_sb);
-				unlock_kernel();
-			}
-
-			mntput(mnt);
-		} else {
-			/* someone brought it back to life whilst we didn't
-			 * have any locks held so return it to the expiration
-			 * list */
-			list_add_tail(&mnt->mnt_fslink, mounts);
-			spin_unlock(&vfsmount_lock);
+		sb = mnt->mnt_sb;
+		umount_mnt(mnt, 1);
+		/*
+		 * now lay it to rest if this was the last ref on the
+		 * superblock
+		 */
+		if (atomic_read(&sb->s_active) == 1) {
+			/* last instance - try to be smart */
+			lock_kernel();
+			DQUOT_OFF(sb);
+			acct_auto_close(sb);
+			unlock_kernel();
 		}
-
-		up_write(&namespace_sem);
-
 		mntput(mnt);
-		put_namespace(namespace);
 
-		spin_lock(&vfsmount_lock);
+		put_namespace(namespace);
 	}
-
 	spin_unlock(&vfsmount_lock);
+	up_write(&namespace_sem);
 }
 
 EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);
@@ -2149,24 +2109,24 @@ asmlinkage long sys_pivot_root(const cha
 	detach_recursive_mnt(new_nd.mnt, &parent_nd);
 
 	spin_unlock(&vfsmount_lock);
- 	if ((error = attach_recursive_mnt(user_nd.mnt, &old_nd, 1))) {
+ 	if ((error = attach_recursive_mnt(new_nd.mnt, &root_parent, 1))) {
 		spin_lock(&vfsmount_lock);
 		undo_detach_recursive_mnt(new_nd.mnt, &parent_nd);
 		undo_detach_recursive_mnt(user_nd.mnt, &root_parent);
 		goto out3;
 	}
 	spin_lock(&vfsmount_lock);
- 	mntput(user_nd.mnt);
+ 	mntput(new_nd.mnt);
 
 	spin_unlock(&vfsmount_lock);
- 	if ((error = attach_recursive_mnt(new_nd.mnt, &root_parent, 1))) {
+ 	if ((error = attach_recursive_mnt(user_nd.mnt, &old_nd, 1))) {
 		spin_lock(&vfsmount_lock);
 		undo_detach_recursive_mnt(new_nd.mnt, &parent_nd);
 		undo_detach_recursive_mnt(user_nd.mnt, &root_parent);
 		goto out3;
 	}
 	spin_lock(&vfsmount_lock);
- 	mntput(new_nd.mnt);
+ 	mntput(user_nd.mnt);
 
 	spin_unlock(&vfsmount_lock);
 	chroot_fs_refs(&user_nd, &new_nd);
Index: 2.6.12.work2/fs/pnode.c
===================================================================
--- 2.6.12.work2.orig/fs/pnode.c
+++ 2.6.12.work2/fs/pnode.c
@@ -29,7 +29,7 @@
 static kmem_cache_t * pnode_cachep;
 
 /* spinlock for pnode related operations */
- __cacheline_aligned_in_smp DEFINE_SPINLOCK(vfspnode_lock);
+  __cacheline_aligned_in_smp DEFINE_SPINLOCK(vfspnode_lock);
 
 enum pnode_vfs_type {
 	PNODE_MEMBER_VFS = 0x01,
@@ -673,6 +673,7 @@ static int vfs_busy(struct vfsmount *mnt
 	struct dentry *dentry = va_arg(args, struct dentry *);
 	struct dentry *rootdentry = va_arg(args, struct dentry *);
 	struct vfsmount *origmnt = va_arg(args, struct vfsmount *);
+	int    refcnt = va_arg(args, int);
 	struct vfsmount *child_mnt;
 	int ret=0;
 
@@ -685,22 +686,21 @@ static int vfs_busy(struct vfsmount *mnt
 
 	if (list_empty(&child_mnt->mnt_mounts)) {
 		if (origmnt == child_mnt)
-			ret = do_refcount_check(child_mnt, 3);
+			ret = do_refcount_check(child_mnt, refcnt+1);
 		else
-			ret = do_refcount_check(child_mnt, 2);
+			ret = do_refcount_check(child_mnt, refcnt);
 	}
 	mntput(child_mnt);
 	return ret;
 }
 
 int pnode_mount_busy(struct vfspnode *pnode, struct dentry *mntpt,
-		struct dentry *root, struct vfsmount *mnt)
+		struct dentry *root, struct vfsmount *mnt, int refcnt)
 {
 	return pnode_traverse(pnode, NULL, NULL,
-			NULL, NULL, vfs_busy, mntpt, root, mnt);
+			NULL, NULL, vfs_busy, mntpt, root, mnt, refcnt);
 }
 
-
 int vfs_umount(struct vfsmount *mnt, enum pnode_vfs_type flag,
 		void *indata, va_list args)
 {
Index: 2.6.12.work2/include/linux/pnode.h
===================================================================
--- 2.6.12.work2.orig/include/linux/pnode.h
+++ 2.6.12.work2/include/linux/pnode.h
@@ -77,6 +77,7 @@ void pnode_add_member_mnt(struct vfspnod
 void pnode_del_slave_mnt(struct vfsmount *);
 void pnode_del_member_mnt(struct vfsmount *);
 void pnode_disassociate_mnt(struct vfsmount *);
+void pnode_member_to_slave(struct vfsmount *);
 void pnode_add_slave_pnode(struct vfspnode *, struct vfspnode *);
 struct vfsmount * pnode_make_mounted(struct vfspnode *, struct vfsmount *,
 		struct dentry *);
@@ -91,5 +92,5 @@ int pnode_commit_mount(struct vfspnode *
 int pnode_abort_mount(struct vfspnode *, struct vfsmount *);
 int pnode_umount(struct vfspnode *, struct dentry *, struct dentry *);
 int pnode_mount_busy(struct vfspnode *, struct dentry *, struct dentry *,
-		struct vfsmount *);
+		struct vfsmount *, int);
 #endif /* _LINUX_PNODE_H */

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: supposed to be shared subtree patches.
  2005-07-25 22:44 (unknown) Ram Pai
                   ` (6 preceding siblings ...)
  2005-07-25 22:44 ` (unknown) Ram Pai
@ 2005-07-26  2:53 ` Ram Pai
       [not found] ` <20050725225908.031752000@localhost>
       [not found] ` <20050725225907.007405000@localhost>
  9 siblings, 0 replies; 37+ messages in thread
From: Ram Pai @ 2005-07-26  2:53 UTC (permalink / raw)
  To: linux-kernel, linux-fsdevel
  Cc: akpm, Al Viro, Avantika Mathur, Mike Waychison

On Mon, 2005-07-25 at 15:44, Ram Pai wrote:
> , miklos@szeredi.hu, Janak Desai <janak@us.ibm.com>, linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org
> Subject: [PATCH 0/7] shared subtree
> 
> Hi Andrew/Al Viro,
> 
> 	Enclosing a final set of well tested patches that implement

....my apologies. I screwed up sending the patches through quilt.

anyway I have received the following comments from Andrew Morton, which
I will incorporate before sending out saner looking patches.
sorry again,
RP

Andrew's comments follows:
----------------------------------------

Frankly, I don't even know what these patches _do_, and haven't spent
the time to try to find out.

If these patches are merged, how do we expect end-users to find out how
to use the new capabilities?

A few paragraphs in the patch #1 changelog would help.  A high-level
description of the new capability which explains what it does and why it
would be a useful thing for Linux.

And maybe some deeper information in a Documentation/ file.

Right now, there might well be a lot of people who could use these new
features, but they don't even know that these patches provide them! 
It's all a bit of a mystery, really.
---------------------------------------------------------------------

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 3/7] shared subtree
       [not found] ` <20050725225908.031752000@localhost>
@ 2005-07-27 19:13   ` Miklos Szeredi
  2005-07-27 20:30     ` Ram Pai
  0 siblings, 1 reply; 37+ messages in thread
From: Miklos Szeredi @ 2005-07-27 19:13 UTC (permalink / raw)
  To: linuxram; +Cc: mathurav, mike, janak, linux-fsdevel, linux-kernel

> @@ -54,7 +55,7 @@ static inline unsigned long hash(struct 
>  
>  struct vfsmount *alloc_vfsmnt(const char *name)
>  {
> -	struct vfsmount *mnt = kmem_cache_alloc(mnt_cache, GFP_KERNEL); 
> +	struct vfsmount *mnt = kmem_cache_alloc(mnt_cache, GFP_KERNEL);
>  	if (mnt) {
>  		memset(mnt, 0, sizeof(struct vfsmount));
>  		atomic_set(&mnt->mnt_count,1);

Please make whitespace changes a separate patch.

> @@ -128,11 +162,71 @@ static void attach_mnt(struct vfsmount *
>  {
>  	mnt->mnt_parent = mntget(nd->mnt);
>  	mnt->mnt_mountpoint = dget(nd->dentry);
> -	list_add(&mnt->mnt_hash, mount_hashtable+hash(nd->mnt, nd->dentry));
> +	mnt->mnt_namespace = nd->mnt->mnt_namespace;
> +	list_add_tail(&mnt->mnt_hash,
> +			mount_hashtable+hash(nd->mnt, nd->dentry));
>  	list_add_tail(&mnt->mnt_child, &nd->mnt->mnt_mounts);
>  	nd->dentry->d_mounted++;
>  }

Why list_add_tail()?  This changes user visible behavior, and seems
unnecessary.

> +static void attach_prepare_mnt(struct vfsmount *mnt, struct nameidata *nd)
> +{
> +	mnt->mnt_parent = mntget(nd->mnt);
> +	mnt->mnt_mountpoint = dget(nd->dentry);
> +	nd->dentry->d_mounted++;
> +}
> +
> +

You shouldn't add unnecessary newlines.  There are a lot of these,
please audit all your patches.

> +void do_attach_commit_mnt(struct vfsmount *mnt)
> +{
> +	struct vfsmount *parent = mnt->mnt_parent;
> +	BUG_ON(parent==mnt);

	BUG_ON(parent == mnt);

> +	if(list_empty(&mnt->mnt_hash))

	if (list_empty(&mnt->mnt_hash))

> +		list_add_tail(&mnt->mnt_hash,
> +			mount_hashtable+hash(parent, mnt->mnt_mountpoint));
> +	if(list_empty(&mnt->mnt_child))
> +		list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
> +	mnt->mnt_namespace = parent->mnt_namespace;
> +	list_add_tail(&mnt->mnt_list, &mnt->mnt_namespace->list);
> +}

Etc.  Maybe you should run Lindent on your changes, but be careful not
to change existing code, even if Lindent would do that!

> @@ -191,7 +270,7 @@ static void *m_start(struct seq_file *m,
>  	struct list_head *p;
>  	loff_t l = *pos;
>  
> -	down_read(&n->sem);
> +	down_read(&namespace_sem);
>  	list_for_each(p, &n->list)
>  		if (!l--)
>  			return list_entry(p, struct vfsmount, mnt_list);

This should be a separate patch.  You can just take the one from the
detached trees patch-series.

> +/*
> + * abort the operations done in attach_recursive_mnt(). run through the mount
> + * tree, till vfsmount 'last' and undo the changes.  Ensure that all the mounts
> + * in the tree are all back in the mnt_list headed at 'source_mnt'.
> + * NOTE: This function is closely tied to the logic in
> + * 'attach_recursive_mnt()'
> + */
> +static void abort_attach_recursive_mnt(struct vfsmount *source_mnt, struct
> +		vfsmount *last, struct list_head *head) { struct vfsmount *p =
> +	source_mnt, *m; struct vfspnode *src_pnode;

If you want to do proper error handling, instead of doing rollback, it
seems better to first do anything that can fail (allocations), then do
the actual attaching, which cannot fail.  It isn't nice to have
transient states on failure.

> + /*
> + * This operation is equivalent of mount --bind dir dir
> + * create a new mount at the dentry, and unmount all child mounts
> + * mounted on top of dentries below 'dentry', and mount them
> + * under the new mount.
> +  */
> +struct vfsmount *do_make_mounted(struct vfsmount *mnt, struct dentry *dentry)

Why is this needed?  I thought we agreed, that this can be removed.

Miklos

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/7] shared subtree
       [not found] ` <20050725225907.007405000@localhost>
@ 2005-07-27 19:54   ` Miklos Szeredi
  2005-07-27 21:39     ` Ram Pai
  0 siblings, 1 reply; 37+ messages in thread
From: Miklos Szeredi @ 2005-07-27 19:54 UTC (permalink / raw)
  To: linuxram; +Cc: akpm, viro, mathurav, mike, janak, linux-fsdevel, linux-kernel

> +static int do_make_shared(struct vfsmount *mnt)
> +{
> +	int err=0;
> +	struct vfspnode *old_pnode = NULL;
> +	/*
> +	 * if the mount is already a slave mount,
> +	 * allocate a new pnode and make it
> +	 * a slave pnode of the original pnode.
> +	 */
> +	if (IS_MNT_SLAVE(mnt)) {
> +		old_pnode = mnt->mnt_pnode;
> +		pnode_del_slave_mnt(mnt);
> +	}
> +	if(!IS_MNT_SHARED(mnt)) {
> +		mnt->mnt_pnode = pnode_alloc();
> +		if(!mnt->mnt_pnode) {
> +			pnode_add_slave_mnt(old_pnode, mnt);
> +			err = -ENOMEM;
> +			goto out;
> +		}
> +		pnode_add_member_mnt(mnt->mnt_pnode, mnt);
> +	}
> +	if(old_pnode)
> +		pnode_add_slave_pnode(old_pnode, mnt->mnt_pnode);
> +	set_mnt_shared(mnt);
> +out:
> +	return err;
> +}

This is an example, where having struct pnode just complicates things.
If there was no struct pnode, this function would be just one line:
setting the shared flag.

> +static kmem_cache_t * pnode_cachep;
> +
> +/* spinlock for pnode related operations */
> + __cacheline_aligned_in_smp DEFINE_SPINLOCK(vfspnode_lock);
> +
> +enum pnode_vfs_type {
> +	PNODE_MEMBER_VFS = 0x01,
> +	PNODE_SLAVE_VFS = 0x02
> +};
> +
> +void __init pnode_init(unsigned long mempages)
> +{
> +	pnode_cachep = kmem_cache_create("pnode_cache",
> +                       sizeof(struct vfspnode), 0,
> +                       SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
> +}
> +
> +struct vfspnode * pnode_alloc(void)
> +{
> +	struct vfspnode *pnode =  kmem_cache_alloc(pnode_cachep, GFP_KERNEL);
> +	INIT_LIST_HEAD(&pnode->pnode_vfs);
> +	INIT_LIST_HEAD(&pnode->pnode_slavevfs);
> +	INIT_LIST_HEAD(&pnode->pnode_slavepnode);
> +	INIT_LIST_HEAD(&pnode->pnode_peer_slave);
> +	pnode->pnode_master = NULL;
> +	pnode->pnode_flags = 0;
> +	atomic_set(&pnode->pnode_count,0);
> +	return pnode;
> +}
> +
> +void inline pnode_free(struct vfspnode *pnode)
> +{
> +	kmem_cache_free(pnode_cachep, pnode);
> +}
> +
> +/*
> + * __put_pnode() should be called with vfspnode_lock held
> + */
> +void __put_pnode(struct vfspnode *pnode)
> +{
> +	struct vfspnode *tmp_pnode;
> +	do {
> +		tmp_pnode = pnode->pnode_master;
> +		list_del_init(&pnode->pnode_peer_slave);
> +		BUG_ON(!list_empty(&pnode->pnode_vfs));
> +		BUG_ON(!list_empty(&pnode->pnode_slavevfs));
> +		BUG_ON(!list_empty(&pnode->pnode_slavepnode));
> +		pnode_free(pnode);
> +		pnode = tmp_pnode;
> +		if (!pnode || !atomic_dec_and_test(&pnode->pnode_count))
> +			break;
> +	} while(pnode);
> +}
> +

All these are really unnecessary IMO.

> +/*
> + * merge 'pnode' into 'peer_pnode' and get rid of pnode
> + * @pnode: pnode the contents of which have to be merged
> + * @peer_pnode: pnode into which the contents are merged
> + */
> +int pnode_merge_pnode(struct vfspnode *pnode, struct vfspnode *peer_pnode)
> +{
> +	struct vfspnode *slave_pnode, *pnext;
> +	struct vfsmount *mnt, *slave_mnt, *next;
> +
> +	list_for_each_entry_safe(slave_pnode,  pnext,
> +			&pnode->pnode_slavepnode, pnode_peer_slave) {
> +		slave_pnode->pnode_master = peer_pnode;
> +		list_move(&slave_pnode->pnode_peer_slave,
> +				&peer_pnode->pnode_slavepnode);
> +		put_pnode_locked(pnode);
> +		get_pnode(peer_pnode);
> +	}
> +
> +	list_for_each_entry_safe(slave_mnt,  next,
> +			&pnode->pnode_slavevfs, mnt_pnode_mntlist) {
> +		slave_mnt->mnt_pnode = peer_pnode;
> +		list_move(&slave_mnt->mnt_pnode_mntlist,
> +				&peer_pnode->pnode_slavevfs);
> +		put_pnode_locked(pnode);
> +		get_pnode(peer_pnode);
> +	}
> +
> +	list_for_each_entry_safe(mnt, next,
> +			&pnode->pnode_vfs, mnt_pnode_mntlist) {
> +		mnt->mnt_pnode = peer_pnode;
> +		list_move(&mnt->mnt_pnode_mntlist,
> +				&peer_pnode->pnode_vfs);
> +		put_pnode_locked(pnode);
> +		get_pnode(peer_pnode);
> +	}
> +	return 0;
> +}

Much overcomplication.  It would just be a list_splice(), if there was
no struct pnode.


> +static void empty_pnode(struct vfspnode *pnode) { struct vfsmount *slave_mnt,
> +	*next; struct vfspnode *master_pnode, *slave_pnode, *pnext;
> +
> +	if ((master_pnode = pnode->pnode_master)) {
> +		pnode->pnode_master = NULL;
> +		list_del_init(&pnode->pnode_peer_slave);
> +		pnode_merge_pnode(pnode, master_pnode);
> +		put_pnode_locked(master_pnode);
> +	} else {
> +		list_for_each_entry_safe(slave_mnt, next,
> +			&pnode->pnode_slavevfs, mnt_pnode_mntlist) {
> +			list_del_init(&slave_mnt->mnt_pnode_mntlist);
> +			set_mnt_private(slave_mnt);
> +			put_pnode_locked(pnode);
> +		}
> +		list_for_each_entry_safe(slave_pnode,  pnext,
> +			&pnode->pnode_slavepnode, pnode_peer_slave) {
> +			slave_pnode->pnode_master = NULL;
> +			list_del_init(&slave_pnode->pnode_peer_slave);
> +			put_pnode_locked(pnode);
> +		}
> +	}
> +}
> +

Unnecessary.

> +static int pnode_next(struct pcontext *context)
> +{
> +	struct vfspnode *pnode = context->pnode;
> +	struct vfspnode	*master_pnode=context->master_pnode;
> +	struct list_head *next;
> +
> +	if (!pnode) {
> +		BUG_ON(!context->start);
> +		get_pnode(context->start);
> +		context->pnode = context->start;
> +		context->master_pnode = NULL;
> +		context->level = 0;
> +		return 1;
> +	}
> +
> +	spin_lock(&vfspnode_lock);
> +	next = pnode->pnode_slavepnode.next;
> +	if (next == &pnode->pnode_slavepnode) {
> +		while (1) {
> +			int flag;
> +
> +			if (pnode == context->start) {
> +				put_pnode_locked(pnode);
> +				spin_unlock(&vfspnode_lock);
> +				BUG_ON(context->level != 0);
> +				return 0;
> +			}
> +
> +			next = pnode->pnode_peer_slave.next;
> +			flag = (next != &pnode->pnode_master->pnode_slavepnode);
> +			put_pnode_locked(pnode);
> +
> +			if (flag)
> +				break;
> +
> +			pnode = master_pnode;
> +			master_pnode = pnode->pnode_master;
> +			context->level--;
> +		}
> +	} else {
> +		master_pnode = pnode;
> +		context->level++;
> +	}
> +
> +	pnode = list_entry(next, struct vfspnode, pnode_peer_slave);
> +	get_pnode(pnode);
> +
> +	context->pnode = pnode;
> +	context->master_pnode = master_pnode;
> +	spin_unlock(&vfspnode_lock);
> +	return 1;
> +}
> +
> +/*
> + * skip the rest of the tree, cleaning up
> + * reference to pnodes held in pnode_next().
> + */
> +static void pnode_end(struct pcontext *context)
> +{
> +	struct vfspnode *p = context->pnode;
> +	struct vfspnode *start = context->start;
> +
> +	do {
> +		put_pnode(p);
> +	} while (p != start && (p = p->pnode_master));
> +	return;
> +}
> +
> +/*
> + * traverse the pnode tree and at each pnode encountered, execute the
> + * pnode_fnc(). For each vfsmount encountered call the vfs_fnc().
> + *
> + * @pnode: pnode tree to be traversed
> + * @in_data: input data
> + * @out_data: output data
> + * @pnode_func: function to be called when a new pnode is encountered.
> + * @vfs_func: function to be called on each slave and member vfs belonging
> + * 		to the pnode.
> + */
> +static int pnode_traverse(struct vfspnode *pnode,
> +		void *in_data,
> +		void **out_data,
> +		int (*pnode_pre_func)(struct vfspnode *,
> +			void *, void **, va_list),
> +		int (*pnode_post_func)(struct vfspnode *,
> +			void *, va_list),
> +		int (*vfs_func)(struct vfsmount *,
> +			enum pnode_vfs_type, void *,  va_list),
> +		...)
> +{
> +	va_list args;
> +	int ret = 0, level;
> +	void *my_data, *data_from_master;
> +     	struct vfspnode *master_pnode;
> +     	struct vfsmount *slave_mnt, *member_mnt, *t_m;
> +	struct pcontext context;
> +	static void *p_array[PNODE_MAX_SLAVE_LEVEL];
> +
> +	context.start = pnode;
> +	context.pnode = NULL;
> +	/*
> +	 * determine whether to process vfs first or the
> +	 * slave pnode first
> +	 */
> +	while (pnode_next(&context)) {
> +		level = context.level;
> +
> +		if (level >= PNODE_MAX_SLAVE_LEVEL)
> +			goto error;
> +
> +		pnode = context.pnode;
> +		master_pnode = context.master_pnode;
> +
> +		if (master_pnode) {
> +			data_from_master = p_array[level-1];
> +			my_data = NULL;
> +		} else {
> +			data_from_master = NULL;
> +			my_data = in_data;
> +		}
> +
> +		if (pnode_pre_func) {
> +			va_start(args, vfs_func);
> +			if((ret = pnode_pre_func(pnode,
> +				data_from_master, &my_data, args)))
> +				goto error;
> +			va_end(args);
> +		}
> +
> +		// traverse member vfsmounts
> +		spin_lock(&vfspnode_lock);
> +		list_for_each_entry_safe(member_mnt,
> +			t_m, &pnode->pnode_vfs, mnt_pnode_mntlist) {
> +
> +			spin_unlock(&vfspnode_lock);
> +			va_start(args, vfs_func);
> +			if ((ret = vfs_func(member_mnt,
> +				PNODE_MEMBER_VFS, my_data, args)))
> +				goto error;
> +			va_end(args);
> +			spin_lock(&vfspnode_lock);
> +		}
> +		list_for_each_entry_safe(slave_mnt, t_m,
> +			&pnode->pnode_slavevfs, mnt_pnode_mntlist) {
> +
> +			spin_unlock(&vfspnode_lock);
> +			va_start(args, vfs_func);
> +			if ((ret = vfs_func(slave_mnt, PNODE_SLAVE_VFS,
> +				my_data, args)))
> +				goto error;
> +			va_end(args);
> +			spin_lock(&vfspnode_lock);
> +		}
> +		spin_unlock(&vfspnode_lock);
> +
> +		if (pnode_post_func) {
> +			va_start(args, vfs_func);
> +			if((ret = pnode_post_func(pnode,
> +				my_data, args)))
> +				goto error;
> +			va_end(args);
> +		}
> +
> +		p_array[level] = my_data;
> +	}
> +out:
> +	if (out_data)
> +		*out_data = p_array[0];
> +	return ret;
> +error:
> +	va_end(args);
> +	pnode_end(&context);
> +	goto out;
> +}
> 

And this is the worst part.  As I said earlier, void pointers and
variable argument functions have no place in this kind of code.

I think you could get rid of all these if you'd implement a simple
iterator function which returns the traversed vfsmounts.  That's
another big argument for getting rid of struct pnode: you could do
iteration simply by holding onto a vfsmount pointer, instead of having
to do a two level iteration, once over pnodes, then over vfsmounts.

> +extern spinlock_t vfspnode_lock;
> +extern void __put_pnode(struct vfspnode *);
> +
> +static inline struct vfspnode *
> +get_pnode(struct vfspnode *pnode)
> +{
> +	if (!pnode)
> +		return NULL;
> +	atomic_inc(&pnode->pnode_count);
> +	return pnode;
> +}
> +
> +static inline void
> +put_pnode(struct vfspnode *pnode)
> +{
> +	if (!pnode)
> +		return;
> +	if (atomic_dec_and_lock(&pnode->pnode_count, &vfspnode_lock)) {
> +		__put_pnode(pnode);
> +		spin_unlock(&vfspnode_lock);
> +	}
> +}

Unnecessary.

> +#define MNT_PRIVATE	0x10  /* if the vfsmount is private, by default it is private*/

If by default it's private, why is this flag needed?

Miklos

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 3/7] shared subtree
  2005-07-27 19:13   ` [PATCH 3/7] shared subtree Miklos Szeredi
@ 2005-07-27 20:30     ` Ram Pai
  2005-07-28  8:34       ` Miklos Szeredi
  0 siblings, 1 reply; 37+ messages in thread
From: Ram Pai @ 2005-07-27 20:30 UTC (permalink / raw)
  To: Miklos Szeredi; +Cc: Avantika Mathur, mike, janak, linux-fsdevel, linux-kernel

On Wed, 2005-07-27 at 12:13, Miklos Szeredi wrote:
> > @@ -54,7 +55,7 @@ static inline unsigned long hash(struct 
> >  
> >  struct vfsmount *alloc_vfsmnt(const char *name)
> >  {
> > -	struct vfsmount *mnt = kmem_cache_alloc(mnt_cache, GFP_KERNEL); 
> > +	struct vfsmount *mnt = kmem_cache_alloc(mnt_cache, GFP_KERNEL);
> >  	if (mnt) {
> >  		memset(mnt, 0, sizeof(struct vfsmount));
> >  		atomic_set(&mnt->mnt_count,1);
> 
> Please make whitespace changes a separate patch.

 I tried to remove trailing whitespaces in the current code
whereever I found them. Ok will them a separate patch.


> 
> > @@ -128,11 +162,71 @@ static void attach_mnt(struct vfsmount *
> >  {
> >  	mnt->mnt_parent = mntget(nd->mnt);
> >  	mnt->mnt_mountpoint = dget(nd->dentry);
> > -	list_add(&mnt->mnt_hash, mount_hashtable+hash(nd->mnt, nd->dentry));
> > +	mnt->mnt_namespace = nd->mnt->mnt_namespace;
> > +	list_add_tail(&mnt->mnt_hash,
> > +			mount_hashtable+hash(nd->mnt, nd->dentry));
> >  	list_add_tail(&mnt->mnt_child, &nd->mnt->mnt_mounts);
> >  	nd->dentry->d_mounted++;
> >  }
> 
> Why list_add_tail()?  This changes user visible behavior, and seems
> unnecessary.

Yes. I was about to send out a mail questioning the existing behavior. I
will start a seperate thread questioning the current behavoir. My plan
was to discuss the current behavior before making this change. I thought
I had reverted this change. But it slipped in. 

> 
> > +static void attach_prepare_mnt(struct vfsmount *mnt, struct nameidata *nd)
> > +{
> > +	mnt->mnt_parent = mntget(nd->mnt);
> > +	mnt->mnt_mountpoint = dget(nd->dentry);
> > +	nd->dentry->d_mounted++;
> > +}
> > +
> > +
> 
> You shouldn't add unnecessary newlines.  There are a lot of these,
> please audit all your patches.

ok. sure.

> 
> > +void do_attach_commit_mnt(struct vfsmount *mnt)
> > +{
> > +	struct vfsmount *parent = mnt->mnt_parent;
> > +	BUG_ON(parent==mnt);
> 
> 	BUG_ON(parent == mnt);
> 
> > +	if(list_empty(&mnt->mnt_hash))
> 
> 	if (list_empty(&mnt->mnt_hash))
> 
> > +		list_add_tail(&mnt->mnt_hash,
> > +			mount_hashtable+hash(parent, mnt->mnt_mountpoint));
> > +	if(list_empty(&mnt->mnt_child))
> > +		list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
> > +	mnt->mnt_namespace = parent->mnt_namespace;
> > +	list_add_tail(&mnt->mnt_list, &mnt->mnt_namespace->list);
> > +}
> 
> Etc.  Maybe you should run Lindent on your changes, but be careful not
> to change existing code, even if Lindent would do that!

sure :)

> 
> > @@ -191,7 +270,7 @@ static void *m_start(struct seq_file *m,
> >  	struct list_head *p;
> >  	loff_t l = *pos;
> >  
> > -	down_read(&n->sem);
> > +	down_read(&namespace_sem);
> >  	list_for_each(p, &n->list)
> >  		if (!l--)
> >  			return list_entry(p, struct vfsmount, mnt_list);
> 
> This should be a separate patch.  You can just take the one from the
> detached trees patch-series.

ok. in fact these changes were motivated by that patch.

> 
> > +/*
> > + * abort the operations done in attach_recursive_mnt(). run through the mount
> > + * tree, till vfsmount 'last' and undo the changes.  Ensure that all the mounts
> > + * in the tree are all back in the mnt_list headed at 'source_mnt'.
> > + * NOTE: This function is closely tied to the logic in
> > + * 'attach_recursive_mnt()'
> > + */
> > +static void abort_attach_recursive_mnt(struct vfsmount *source_mnt, struct
> > +		vfsmount *last, struct list_head *head) { struct vfsmount *p =
> > +	source_mnt, *m; struct vfspnode *src_pnode;
> 
> If you want to do proper error handling, instead of doing rollback, it
> seems better to first do anything that can fail (allocations), then do
> the actual attaching, which cannot fail.  It isn't nice to have
> transient states on failure.

yes. it does exactly what you said. In the prepare stage it does not
touch any of the existing vfstree or the pnode tree.

All it does it builds a new vfstree and pnode tree, does the necessary
changes to them. And if everthing is successful, it glues the new tree
to the existing tree (which is the commit phase), and if the prepare
stage fails allocating memory or any other reason, it goes and destroys
the new trees (in the abort phase).

Offcourse in the prepare state, it does increase the reference count of
the vfsmounts to which the new tree will be attached. This is to ensure
that the vfsmounts have not disappeared by the time we reach the commit
phase.  I think we are talking the same thing, and the code behaves
exactly as you said.


> 
> > + /*
> > + * This operation is equivalent of mount --bind dir dir
> > + * create a new mount at the dentry, and unmount all child mounts
> > + * mounted on top of dentries below 'dentry', and mount them
> > + * under the new mount.
> > +  */
> > +struct vfsmount *do_make_mounted(struct vfsmount *mnt, struct dentry *dentry)
> 
> Why is this needed?  I thought we agreed, that this can be removed.

yes we agreed on returning EINVAL when a directory is attempted to made
shared/private/slave/unclonnable.   But this is a different case.

lets say  /mnt is a mountpoint having a vfsmount (say A). 
now if you run 
	mount --bind /mnt/a  /tmp
this operation succeeds currently. 

now lets say /mnt is a mountpoint having a vfsmount which is shared.
now if you run
	mount --bind /mnt/a /tmp

we now have a mount at /tmp which gets propogation from mounts under
/mnt/a. right?
but /mnt/a is not a mountpoint at all.  if we need to track and
propogate mounts/unmounts under /tmp or /mnt/a we need to have a mount
at /mnt/a. WHich means we have to implicitly make /mnt/a as a
mountpoint. Or the other solution is to return -EINVAL.

But returning -EINVAL is inconsistent with the standard behavior of
	mount --bind 
the standard behavior allows bind mounts from any directory; need not be
a mountpoint. 

We had this exact discussion about the behavior with Mike Waychison, Al
Viro and Bruce Fields. Mike suggested the above implemented behavior.

RP



> 
> Miklos
> -
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/7] shared subtree
  2005-07-27 19:54   ` [PATCH 1/7] " Miklos Szeredi
@ 2005-07-27 21:39     ` Ram Pai
  2005-07-28  7:35       ` mount behavior question Ram Pai
  2005-07-28  9:57       ` [PATCH 1/7] shared subtree Miklos Szeredi
  0 siblings, 2 replies; 37+ messages in thread
From: Ram Pai @ 2005-07-27 21:39 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: Andrew Morton, viro, Avantika Mathur, mike, janak, linux-fsdevel,
	linux-kernel

On Wed, 2005-07-27 at 12:54, Miklos Szeredi wrote:
> > +static int do_make_shared(struct vfsmount *mnt)
> > +{
> > +	int err=0;
> > +	struct vfspnode *old_pnode = NULL;
> > +	/*
> > +	 * if the mount is already a slave mount,
> > +	 * allocate a new pnode and make it
> > +	 * a slave pnode of the original pnode.
> > +	 */
> > +	if (IS_MNT_SLAVE(mnt)) {
> > +		old_pnode = mnt->mnt_pnode;
> > +		pnode_del_slave_mnt(mnt);
> > +	}
> > +	if(!IS_MNT_SHARED(mnt)) {
> > +		mnt->mnt_pnode = pnode_alloc();
> > +		if(!mnt->mnt_pnode) {
> > +			pnode_add_slave_mnt(old_pnode, mnt);
> > +			err = -ENOMEM;
> > +			goto out;
> > +		}
> > +		pnode_add_member_mnt(mnt->mnt_pnode, mnt);
> > +	}
> > +	if(old_pnode)
> > +		pnode_add_slave_pnode(old_pnode, mnt->mnt_pnode);
> > +	set_mnt_shared(mnt);
> > +out:
> > +	return err;
> > +}
> 
> This is an example, where having struct pnode just complicates things.
> If there was no struct pnode, this function would be just one line:
> setting the shared flag.
So your comment is mostly about getting rid of pnode and distributing
the pnode functionality in the vfsmount structure.

I know you are thinking of just having the necessary propogation list in
the vfsmount structure itself.  Yes true with that implementation the
complication is reduced in this part of the code, but really complicates
the propogation traversal routines. 

 In order to find out the slaves of a given mount:
with your proposal:  I have to walk through all the peer mounts of this
mount and check for any slaves there.
in my implementation: I have to just find which pnode it belongs to, and
	all the slaves are easily available there.

  In order to find out all the shared mounts that are slave of this
mount: 

with your proposal: Not sure how to do. Maybe you have to have another
		field in each of the vfsmounts that will point to
		the shared mounts that are slave of this mount.??

in my implemenation: I have to just find the pnode it belongs to,
	and all the slave pnodes are easily available there.


There is complexity tradeoffs in both the implementations. But I
personally felt having a pnode structure keeps the pnode operations
seperated out cleanly. It helps to easily visualize the propogation
tree. And also one more thing influenced my thought process. The
statement in Al Viro's RFC: 
-----------------------------------------------------------------------
How do we set them up? 

* we can mark a subtree sharable. Every vfsmount in the subtree 
that is not already in some p-node gets a single-element p-node of its 
own. 
* we can mark a subtree slave. That removes all vfsmounts in 
the subtree from their p-nodes and makes them owned by said p-nodes. 
p-nodes that became empty will disappear and everything they used to 
own will be repossessed by their owners (if any). 
* we can mark a subtree private. Same as above, but followed 
by taking all vfsmounts in our subtree and making them *not* owned 
by anybody. 
--------------------------------------------------------------------
The above statements imply some implementation detail. Not sure if you
will buy this point :)


> 
> > +static kmem_cache_t * pnode_cachep;
> > +
> > +/* spinlock for pnode related operations */
> > + __cacheline_aligned_in_smp DEFINE_SPINLOCK(vfspnode_lock);
> > +
> > +enum pnode_vfs_type {
> > +	PNODE_MEMBER_VFS = 0x01,
> > +	PNODE_SLAVE_VFS = 0x02
> > +};
> > +
> > +void __init pnode_init(unsigned long mempages)
> > +{
> > +	pnode_cachep = kmem_cache_create("pnode_cache",
> > +                       sizeof(struct vfspnode), 0,
> > +                       SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
> > +}
> > +
> > +struct vfspnode * pnode_alloc(void)
> > +{
> > +	struct vfspnode *pnode =  kmem_cache_alloc(pnode_cachep, GFP_KERNEL);
> > +	INIT_LIST_HEAD(&pnode->pnode_vfs);
> > +	INIT_LIST_HEAD(&pnode->pnode_slavevfs);
> > +	INIT_LIST_HEAD(&pnode->pnode_slavepnode);
> > +	INIT_LIST_HEAD(&pnode->pnode_peer_slave);
> > +	pnode->pnode_master = NULL;
> > +	pnode->pnode_flags = 0;
> > +	atomic_set(&pnode->pnode_count,0);
> > +	return pnode;
> > +}
> > +
> > +void inline pnode_free(struct vfspnode *pnode)
> > +{
> > +	kmem_cache_free(pnode_cachep, pnode);
> > +}
> > +
> > +/*
> > + * __put_pnode() should be called with vfspnode_lock held
> > + */
> > +void __put_pnode(struct vfspnode *pnode)
> > +{
> > +	struct vfspnode *tmp_pnode;
> > +	do {
> > +		tmp_pnode = pnode->pnode_master;
> > +		list_del_init(&pnode->pnode_peer_slave);
> > +		BUG_ON(!list_empty(&pnode->pnode_vfs));
> > +		BUG_ON(!list_empty(&pnode->pnode_slavevfs));
> > +		BUG_ON(!list_empty(&pnode->pnode_slavepnode));
> > +		pnode_free(pnode);
> > +		pnode = tmp_pnode;
> > +		if (!pnode || !atomic_dec_and_test(&pnode->pnode_count))
> > +			break;
> > +	} while(pnode);
> > +}
> > +
> 
> All these are really unnecessary IMO.
> 
> > +/*
> > + * merge 'pnode' into 'peer_pnode' and get rid of pnode
> > + * @pnode: pnode the contents of which have to be merged
> > + * @peer_pnode: pnode into which the contents are merged
> > + */
> > +int pnode_merge_pnode(struct vfspnode *pnode, struct vfspnode *peer_pnode)
> > +{
> > +	struct vfspnode *slave_pnode, *pnext;
> > +	struct vfsmount *mnt, *slave_mnt, *next;
> > +
> > +	list_for_each_entry_safe(slave_pnode,  pnext,
> > +			&pnode->pnode_slavepnode, pnode_peer_slave) {
> > +		slave_pnode->pnode_master = peer_pnode;
> > +		list_move(&slave_pnode->pnode_peer_slave,
> > +				&peer_pnode->pnode_slavepnode);
> > +		put_pnode_locked(pnode);
> > +		get_pnode(peer_pnode);
> > +	}
> > +
> > +	list_for_each_entry_safe(slave_mnt,  next,
> > +			&pnode->pnode_slavevfs, mnt_pnode_mntlist) {
> > +		slave_mnt->mnt_pnode = peer_pnode;
> > +		list_move(&slave_mnt->mnt_pnode_mntlist,
> > +				&peer_pnode->pnode_slavevfs);
> > +		put_pnode_locked(pnode);
> > +		get_pnode(peer_pnode);
> > +	}
> > +
> > +	list_for_each_entry_safe(mnt, next,
> > +			&pnode->pnode_vfs, mnt_pnode_mntlist) {
> > +		mnt->mnt_pnode = peer_pnode;
> > +		list_move(&mnt->mnt_pnode_mntlist,
> > +				&peer_pnode->pnode_vfs);
> > +		put_pnode_locked(pnode);
> > +		get_pnode(peer_pnode);
> > +	}
> > +	return 0;
> > +}
> 
> Much overcomplication.  It would just be a list_splice(), if there was
> no struct pnode.

yes. with your proposal this will be list_splice(). 

> 
> 
> > +static void empty_pnode(struct vfspnode *pnode) { struct vfsmount *slave_mnt,
> > +	*next; struct vfspnode *master_pnode, *slave_pnode, *pnext;
> > +
> > +	if ((master_pnode = pnode->pnode_master)) {
> > +		pnode->pnode_master = NULL;
> > +		list_del_init(&pnode->pnode_peer_slave);
> > +		pnode_merge_pnode(pnode, master_pnode);
> > +		put_pnode_locked(master_pnode);
> > +	} else {
> > +		list_for_each_entry_safe(slave_mnt, next,
> > +			&pnode->pnode_slavevfs, mnt_pnode_mntlist) {
> > +			list_del_init(&slave_mnt->mnt_pnode_mntlist);
> > +			set_mnt_private(slave_mnt);
> > +			put_pnode_locked(pnode);
> > +		}
> > +		list_for_each_entry_safe(slave_pnode,  pnext,
> > +			&pnode->pnode_slavepnode, pnode_peer_slave) {
> > +			slave_pnode->pnode_master = NULL;
> > +			list_del_init(&slave_pnode->pnode_peer_slave);
> > +			put_pnode_locked(pnode);
> > +		}
> > +	}
> > +}
> > +
> 
> Unnecessary.

may be this function can be merged with the functionality in
pnode_merge(). I will look into that.

> 
> > +static int pnode_next(struct pcontext *context)
> > +{
> > +	struct vfspnode *pnode = context->pnode;
> > +	struct vfspnode	*master_pnode=context->master_pnode;
> > +	struct list_head *next;
> > +
> > +	if (!pnode) {
> > +		BUG_ON(!context->start);
> > +		get_pnode(context->start);
> > +		context->pnode = context->start;
> > +		context->master_pnode = NULL;
> > +		context->level = 0;
> > +		return 1;
> > +	}
> > +
> > +	spin_lock(&vfspnode_lock);
> > +	next = pnode->pnode_slavepnode.next;
> > +	if (next == &pnode->pnode_slavepnode) {
> > +		while (1) {
> > +			int flag;
> > +
> > +			if (pnode == context->start) {
> > +				put_pnode_locked(pnode);
> > +				spin_unlock(&vfspnode_lock);
> > +				BUG_ON(context->level != 0);
> > +				return 0;
> > +			}
> > +
> > +			next = pnode->pnode_peer_slave.next;
> > +			flag = (next != &pnode->pnode_master->pnode_slavepnode);
> > +			put_pnode_locked(pnode);
> > +
> > +			if (flag)
> > +				break;
> > +
> > +			pnode = master_pnode;
> > +			master_pnode = pnode->pnode_master;
> > +			context->level--;
> > +		}
> > +	} else {
> > +		master_pnode = pnode;
> > +		context->level++;
> > +	}
> > +
> > +	pnode = list_entry(next, struct vfspnode, pnode_peer_slave);
> > +	get_pnode(pnode);
> > +
> > +	context->pnode = pnode;
> > +	context->master_pnode = master_pnode;
> > +	spin_unlock(&vfspnode_lock);
> > +	return 1;
> > +}
> > +
> > +/*
> > + * skip the rest of the tree, cleaning up
> > + * reference to pnodes held in pnode_next().
> > + */
> > +static void pnode_end(struct pcontext *context)
> > +{
> > +	struct vfspnode *p = context->pnode;
> > +	struct vfspnode *start = context->start;
> > +
> > +	do {
> > +		put_pnode(p);
> > +	} while (p != start && (p = p->pnode_master));
> > +	return;
> > +}
> > +
> > +/*
> > + * traverse the pnode tree and at each pnode encountered, execute the
> > + * pnode_fnc(). For each vfsmount encountered call the vfs_fnc().
> > + *
> > + * @pnode: pnode tree to be traversed
> > + * @in_data: input data
> > + * @out_data: output data
> > + * @pnode_func: function to be called when a new pnode is encountered.
> > + * @vfs_func: function to be called on each slave and member vfs belonging
> > + * 		to the pnode.
> > + */
> > +static int pnode_traverse(struct vfspnode *pnode,
> > +		void *in_data,
> > +		void **out_data,
> > +		int (*pnode_pre_func)(struct vfspnode *,
> > +			void *, void **, va_list),
> > +		int (*pnode_post_func)(struct vfspnode *,
> > +			void *, va_list),
> > +		int (*vfs_func)(struct vfsmount *,
> > +			enum pnode_vfs_type, void *,  va_list),
> > +		...)
> > +{
> > +	va_list args;
> > +	int ret = 0, level;
> > +	void *my_data, *data_from_master;
> > +     	struct vfspnode *master_pnode;
> > +     	struct vfsmount *slave_mnt, *member_mnt, *t_m;
> > +	struct pcontext context;
> > +	static void *p_array[PNODE_MAX_SLAVE_LEVEL];
> > +
> > +	context.start = pnode;
> > +	context.pnode = NULL;
> > +	/*
> > +	 * determine whether to process vfs first or the
> > +	 * slave pnode first
> > +	 */
> > +	while (pnode_next(&context)) {
> > +		level = context.level;
> > +
> > +		if (level >= PNODE_MAX_SLAVE_LEVEL)
> > +			goto error;
> > +
> > +		pnode = context.pnode;
> > +		master_pnode = context.master_pnode;
> > +
> > +		if (master_pnode) {
> > +			data_from_master = p_array[level-1];
> > +			my_data = NULL;
> > +		} else {
> > +			data_from_master = NULL;
> > +			my_data = in_data;
> > +		}
> > +
> > +		if (pnode_pre_func) {
> > +			va_start(args, vfs_func);
> > +			if((ret = pnode_pre_func(pnode,
> > +				data_from_master, &my_data, args)))
> > +				goto error;
> > +			va_end(args);
> > +		}
> > +
> > +		// traverse member vfsmounts
> > +		spin_lock(&vfspnode_lock);
> > +		list_for_each_entry_safe(member_mnt,
> > +			t_m, &pnode->pnode_vfs, mnt_pnode_mntlist) {
> > +
> > +			spin_unlock(&vfspnode_lock);
> > +			va_start(args, vfs_func);
> > +			if ((ret = vfs_func(member_mnt,
> > +				PNODE_MEMBER_VFS, my_data, args)))
> > +				goto error;
> > +			va_end(args);
> > +			spin_lock(&vfspnode_lock);
> > +		}
> > +		list_for_each_entry_safe(slave_mnt, t_m,
> > +			&pnode->pnode_slavevfs, mnt_pnode_mntlist) {
> > +
> > +			spin_unlock(&vfspnode_lock);
> > +			va_start(args, vfs_func);
> > +			if ((ret = vfs_func(slave_mnt, PNODE_SLAVE_VFS,
> > +				my_data, args)))
> > +				goto error;
> > +			va_end(args);
> > +			spin_lock(&vfspnode_lock);
> > +		}
> > +		spin_unlock(&vfspnode_lock);
> > +
> > +		if (pnode_post_func) {
> > +			va_start(args, vfs_func);
> > +			if((ret = pnode_post_func(pnode,
> > +				my_data, args)))
> > +				goto error;
> > +			va_end(args);
> > +		}
> > +
> > +		p_array[level] = my_data;
> > +	}
> > +out:
> > +	if (out_data)
> > +		*out_data = p_array[0];
> > +	return ret;
> > +error:
> > +	va_end(args);
> > +	pnode_end(&context);
> > +	goto out;
> > +}
> > 
> 
> And this is the worst part.  As I said earlier, void pointers and
> variable argument functions have no place in this kind of code.
> 
> I think you could get rid of all these if you'd implement a simple
> iterator function which returns the traversed vfsmounts.  That's
> another big argument for getting rid of struct pnode: you could do
> iteration simply by holding onto a vfsmount pointer, instead of having
> to do a two level iteration, once over pnodes, then over vfsmounts.

I can try to implement your style, which is getting rid of pnode, but
seriously that will be really messy implementation. Two many pointers
to traverse in the vfsmount, and no clear idea which path to take to
traverse the propogation tree. 

And if you are ok with pnode, than I can work on simplifying
pnode_traverse routine which has void pointers and variable arguments.
But doing that will just produce lots of redundant code. pnode_traverse
is just a abstract routine that helps all other routines traverse the
pnode tree. Its kind of a helper function.




> 
> > +extern spinlock_t vfspnode_lock;
> > +extern void __put_pnode(struct vfspnode *);
> > +
> > +static inline struct vfspnode *
> > +get_pnode(struct vfspnode *pnode)
> > +{
> > +	if (!pnode)
> > +		return NULL;
> > +	atomic_inc(&pnode->pnode_count);
> > +	return pnode;
> > +}
> > +
> > +static inline void
> > +put_pnode(struct vfspnode *pnode)
> > +{
> > +	if (!pnode)
> > +		return;
> > +	if (atomic_dec_and_lock(&pnode->pnode_count, &vfspnode_lock)) {
> > +		__put_pnode(pnode);
> > +		spin_unlock(&vfspnode_lock);
> > +	}
> > +}
> 
> Unnecessary.
> 
> > +#define MNT_PRIVATE	0x10  /* if the vfsmount is private, by default it is private*/
> 
> If by default it's private, why is this flag needed?

yes it is not needed. Will get rid of this.

RP
> 
> Miklos


^ permalink raw reply	[flat|nested] 37+ messages in thread

* mount behavior question.
  2005-07-27 21:39     ` Ram Pai
@ 2005-07-28  7:35       ` Ram Pai
  2005-07-28 11:56         ` Miklos Szeredi
  2005-07-28 18:27         ` Bryan Henderson
  2005-07-28  9:57       ` [PATCH 1/7] shared subtree Miklos Szeredi
  1 sibling, 2 replies; 37+ messages in thread
From: Ram Pai @ 2005-07-28  7:35 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: Andrew Morton, viro, Avantika Mathur, mike, janak, linux-fsdevel

Summary of the question:
	Should the topmost mount be visible, or should the most recent
	mount be visible?

consider the following command sequence

(1) cd /mnt
(2) mount --bind /usr /mnt
(3) mount --bind /bin /mnt
(4) mount --bind /var .

after step 1, the pwd of the process is pointing to root mount and
directory mnt. lets call the root mount as 'A'

after step 2, a new mount is laid on top of 'A' at the mountpoint mnt
lets call this mount 'B'

after step 3, a new mount is laid on top of 'B' at the mountpoint mnt
which corresponds to the root dentry of 'B'. lets call this new overlaid
mount as 'C'. At this point the visible content of /mnt is
the content of C.

however at step 4, a new mount is laid on top of 'A' at the same
mountpoint mnt, as that of  'B'. Lets call the new mount 'D'.

At this point, the visible content of /mnt is that of D and not that
of C

But should'nt it be C?

Why is that the contents of 'D' made visible? Is there any particular
reason for this behavior? Note: 'D' is mounted on the bottommost mount,
and hence should be obscured by the top level mounts.

To make it simpler, imagine you are viewing a 3 storied transparent
building from the top. If you place an apple in 1st floor and nothing is
placed on any other floors, the apple will be visible from the top.
Now if you place a 'orange' in the 2nd floor the apple should get
obscured by the orange and the 'orange' should start being visible. And
later if you place a 'mango' on 3rd floor, the mango should obscure both
the apple and orange.  but at this point if you place another apple on
top of the first apple in the 1st floor, it cannot be visible, because
the 'orange' and the 'mango' block its line of sight. And hence the
'mango' should still continue to be visible. right? If the apple starts
becoming visible from the top, won't it defy law of visibility? :)

Back to the mount example:
Currently the behavior is the most recent mount is visible and not the
topmost mount.

Not many will run into this question currently, because the sequence of
steps have to orchestrated well to get into this scenario. But with
shared subtrees it is pretty easy to mount something at a lower level
mount because of propogations. And in this case the behavior becomes
totally confusing if the rule is 'expose the most-recent-mount and not
the topmost-mount'.

Here is a scenario with shared subtree. Sorry it is complex.

mount --bind /mnt /mnt
mount --make-shared /mnt
mkdir -p /mnt/p
mount --bind /usr  /mnt/1
mount --bind /mnt  /mnt/2

At this stage the mount at /mnt/2 and /mnt belong to the same pnode
which means mounts under them propogate to each other.

mount --bind /var /mnt/1

the contents of /var will be visible under /mnt/1 and not under /mnt/2
But if mount --bind /var /mnt/2 is executed, the contents of /var is
visible under /mnt/1 as well as /mnt/2 . Isn't this freaky?

On analysis it turns out the culprit is the current rule which says
'expose the most-recent-mount and not the topmost mount'

RP

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 3/7] shared subtree
  2005-07-27 20:30     ` Ram Pai
@ 2005-07-28  8:34       ` Miklos Szeredi
  0 siblings, 0 replies; 37+ messages in thread
From: Miklos Szeredi @ 2005-07-28  8:34 UTC (permalink / raw)
  To: linuxram; +Cc: mathurav, mike, janak, linux-fsdevel, linux-kernel

> yes we agreed on returning EINVAL when a directory is attempted to made
> shared/private/slave/unclonnable.   But this is a different case.
> 
> lets say  /mnt is a mountpoint having a vfsmount (say A). 
> now if you run 
> 	mount --bind /mnt/a  /tmp
> this operation succeeds currently. 
> 
> now lets say /mnt is a mountpoint having a vfsmount which is shared.
> now if you run
> 	mount --bind /mnt/a /tmp
> 
> we now have a mount at /tmp which gets propogation from mounts under
> /mnt/a. right?

Yes.

> but /mnt/a is not a mountpoint at all.  if we need to track and
> propogate mounts/unmounts under /tmp or /mnt/a we need to have a mount
> at /mnt/a.

I don't think we do.  You can just check at propagation time if the
propagated mountpoint is visible in the destination mount or not.
Just like --rbind checks whether children mounts are below or above
the to-be-bound directory.

Miklos

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/7] shared subtree
  2005-07-27 21:39     ` Ram Pai
  2005-07-28  7:35       ` mount behavior question Ram Pai
@ 2005-07-28  9:57       ` Miklos Szeredi
  2005-07-29 19:54         ` Ram Pai
  1 sibling, 1 reply; 37+ messages in thread
From: Miklos Szeredi @ 2005-07-28  9:57 UTC (permalink / raw)
  To: linuxram; +Cc: akpm, viro, mathurav, mike, janak, linux-fsdevel, linux-kernel

> > This is an example, where having struct pnode just complicates things.
> > If there was no struct pnode, this function would be just one line:
> > setting the shared flag.
> So your comment is mostly about getting rid of pnode and distributing
> the pnode functionality in the vfsmount structure.

Yes, sorry if I didn't make it clear.

> I know you are thinking of just having the necessary propogation list in
> the vfsmount structure itself.  Yes true with that implementation the
> complication is reduced in this part of the code, but really complicates
> the propogation traversal routines. 

On the contrary, I think it will simplify the traversal routines.

Here's an iterator function I coded up.  Not tested at all (may not
even compile):

struct vfsmount {
	/* ... */
        
	struct list_head mnt_share;      /* circular list of shared mounts */
	struct list_head mnt_slave_list; /* list of slave mounts */
	struct list_head mnt_slave;      /* slave list entry */
	struct vfsmount *master;         /* slave is on master->mnt_slave_list */
};

static inline struct vfsmount *next_shared(struct vfsmount *p)
{
	return list_entry(p->mnt_share.next, struct vfsmount, mnt_share);
}

static inline struct vfsmount *first_slave(struct vfsmount *p)
{
	return list_entry(p->mnt_slave_list.next, struct vfsmount, mnt_slave);
}

static inline struct vfsmount *next_slave(struct vfsmount *p)
{
	return list_entry(p->mnt_slave.next, struct vfsmount, mnt_slave);
}

static struct vfsmount *propagation_next(struct vfsmount *p,
					 struct vfsmount *base)
{
	/* first iterate over the slaves */
	if (!list_empty(&p->mnt_slave_list))
		return first_slave(p);

	while (1) {
		struct vfsmount *q;

		/* more vfsmounts belong to the pnode? */
		if (!list_empty(&p->mnt_share)) {
			p = next_shared(p);
			if (list_empty(&p->mnt_slave) && p != base)
				return p;
		}
		if (p == base)
			break;
		
		BUG_ON(list_empty(&p->mnt_slave));

		/* more slaves? */
		q = next_slave(p);
		if (p->master != q)
			return q;

		/* back at master */
		p = q;
	}

	return NULL;
}

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: mount behavior question.
  2005-07-28  7:35       ` mount behavior question Ram Pai
@ 2005-07-28 11:56         ` Miklos Szeredi
  2005-07-28 15:02           ` Ram Pai
  2005-07-28 18:27         ` Bryan Henderson
  1 sibling, 1 reply; 37+ messages in thread
From: Miklos Szeredi @ 2005-07-28 11:56 UTC (permalink / raw)
  To: linuxram; +Cc: akpm, viro, mathurav, mike, janak, linux-fsdevel

> Here is a scenario with shared subtree. Sorry it is complex.
> 
> 
> mount --bind /mnt /mnt
> mount --make-shared /mnt
> mkdir -p /mnt/p
> mount --bind /usr  /mnt/1
> mount --bind /mnt  /mnt/2
> 
> At this stage the mount at /mnt/2 and /mnt belong to the same pnode
> which means mounts under them propogate to each other.
> 
> mount --bind /var /mnt/1
> 
> the contents of /var will be visible under /mnt/1 and not under /mnt/2
> But if mount --bind /var /mnt/2 is executed, the contents of /var is
> visible under /mnt/1 as well as /mnt/2 . Isn't this freaky?

I don't understand.

'mount --bind /var /mnt/1' should propagate to /mnt/2/1, not /mnt/2.  No?

'mount --bind /var/ /mnt/2' should propagate to /mnt.  What am I
missing?

Miklos

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: mount behavior question.
  2005-07-28 11:56         ` Miklos Szeredi
@ 2005-07-28 15:02           ` Ram Pai
  2005-07-28 15:58             ` Miklos Szeredi
  0 siblings, 1 reply; 37+ messages in thread
From: Ram Pai @ 2005-07-28 15:02 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: Andrew Morton, viro, Avantika Mathur, mike, janak, linux-fsdevel

On Thu, 2005-07-28 at 04:56, Miklos Szeredi wrote:
> > Here is a scenario with shared subtree. Sorry it is complex.
> > 
> > 
> > mount --bind /mnt /mnt
> > mount --make-shared /mnt
> > mkdir -p /mnt/p
> > mount --bind /usr  /mnt/1
> > mount --bind /mnt  /mnt/2
> > 
> > At this stage the mount at /mnt/2 and /mnt belong to the same pnode
> > which means mounts under them propogate to each other.
> > 
> > mount --bind /var /mnt/1
> > 
> > the contents of /var will be visible under /mnt/1 and not under /mnt/2
> > But if mount --bind /var /mnt/2 is executed, the contents of /var is
> > visible under /mnt/1 as well as /mnt/2 . Isn't this freaky?
> 
> I don't understand.
> 
> 'mount --bind /var /mnt/1' should propagate to /mnt/2/1, not /mnt/2. 

yes it should propogate to /mnt/2/1 , thats what I meant when I said
under /mnt/2, but yes I was not clear. Hope I have a clearer
explanation below.

>  No?
> 
> 'mount --bind /var/ /mnt/2' should propagate to /mnt.  What am I
> missing?


step 1: mount --bind /mnt /mnt
            a new mount 'A' is created at /mnt

step 2: mount --make-shared /mnt
           mounts under 'A' are made shared. But in this case
       there are no other mounts. So only 'A' will be made shared.
          
         
step 3: mkdir -p /mnt/1 /mnt/2
		nothing special here
		
step 4: mount --bind /usr  /mnt/1
                a new mount 'B' is created  at /mnt/1 which is
		 'shared;.
                		

step 5: mount --bind /mnt  /mnt/2

		a new mount 'C' is created at /mnt/2
		and propogation is set between 'A' and 'C'.
		note: 'C' is made shared.



lets say, at this point I try 
		mount --bind /var /mnt/1

	this is going to mount 'D' on top of mount 'B'.  However
	there is no other mount to which 'B' propogates to. So that is 
	it. the contents of /var is only visible at /mnt/1 and it
	 propogates no where else.

but lets say, we tried mount --bind /var /mnt/2/1
	/mnt/2/1 belongs to mount 'C'. And mounts under 'C' propogates to 'A'
too. So in this case a new mount 'E' is created at mnt/1/2
i.e on top of 'C' at dentry '2'  and due to propogation a new mount
'F' is created at /mnt/1 i.e on top of mount 'A' at dentry '1'
 But note: /mnt/1 already has a mount 'B' on top of it.   The new mount
'F' as per the 'most-current mount rule' obscures 'B' even though the
mount is on top of 'A'. As a result the contents of /var are now
visible both at /mnt/2/1 and /mnt/1


Ok the net effect is, mount at /mnt/1 is visible only under /mnt/1
but mount at /mnt/2/1 is visible at mount /mnt/2/1 and /mnt/1
This makes it confusing. If the 'top-most mount rule' is applied
'F' though mounted on 'A', will not be visible because it will get 
obscured by 'B' and the confusion is avoided.

So the point I am driving at is, is there any special reason 
for having 'most-recent mount visible rule' instead of 'top-most mount
visible rule'?
RP


> Miklos


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: mount behavior question.
  2005-07-28 15:02           ` Ram Pai
@ 2005-07-28 15:58             ` Miklos Szeredi
  2005-07-28 18:22               ` Ram Pai
  0 siblings, 1 reply; 37+ messages in thread
From: Miklos Szeredi @ 2005-07-28 15:58 UTC (permalink / raw)
  To: linuxram; +Cc: akpm, viro, mathurav, mike, janak, linux-fsdevel

> step 1: mount --bind /mnt /mnt
>             a new mount 'A' is created at /mnt
> 
> step 2: mount --make-shared /mnt
>            mounts under 'A' are made shared. But in this case
>        there are no other mounts. So only 'A' will be made shared.
>           
>          
> step 3: mkdir -p /mnt/1 /mnt/2
> 		nothing special here
> 		
> step 4: mount --bind /usr  /mnt/1
>                 a new mount 'B' is created  at /mnt/1 which is
> 		 'shared;.
>                 		
> 
> step 5: mount --bind /mnt  /mnt/2
> 
> 		a new mount 'C' is created at /mnt/2
> 		and propogation is set between 'A' and 'C'.
> 		note: 'C' is made shared.
> 
> 
> 
> lets say, at this point I try 
> 		mount --bind /var /mnt/1
> 
> 	this is going to mount 'D' on top of mount 'B'.  However
> 	there is no other mount to which 'B' propogates to. So that is 
> 	it. the contents of /var is only visible at /mnt/1 and it
> 	 propogates no where else.
> 
> but lets say, we tried mount --bind /var /mnt/2/1
> 	/mnt/2/1 belongs to mount 'C'. And mounts under 'C' propogates to 'A'
> too. So in this case a new mount 'E' is created at mnt/1/2

You mean /mnt/2/1

> i.e on top of 'C' at dentry '2'

On dentry '1'

> and due to propogation a new mount 'F' is created at /mnt/1 i.e on
> top of mount 'A' at dentry '1' But note: /mnt/1 already has a mount
> 'B' on top of it.  The new mount 'F' as per the 'most-current mount
> rule' obscures 'B' even though the mount is on top of 'A'. As a
> result the contents of /var are now visible both at /mnt/2/1 and
> /mnt/1

True.  

> Ok the net effect is, mount at /mnt/1 is visible only under /mnt/1
> but mount at /mnt/2/1 is visible at mount /mnt/2/1 and /mnt/1
> This makes it confusing.

But this asymmetry is caused by the fact that mounts from /mnt/1 (B)
don't propagate, while mounts from /mnt/2/1 (C) do.  And not because
of the mount lookup semantics you want to change.  That change in fact
would only _hide_ the asymmetry and not fix it: there would be
propagation in one case and no propagation in the other.

If you did before step 4 in another shell 'cd /mnt/1', then after step
5 you did 'mount --bind /var .', that would have had the same
symmetric effect as the 'mount --bind /var /mnt/2/1'.

> So the point I am driving at is, is there any special reason 
> for having 'most-recent mount visible rule' instead of 'top-most mount
> visible rule'?

I don't think, that 'topmost mount is visible' is any more logical
than 'latest mount is visible'.  The fact that the later has been in
the kernel for a long time means, that there may be systems relying on
this behavior, and changing it would break them.

Miklos

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: mount behavior question.
  2005-07-28 15:58             ` Miklos Szeredi
@ 2005-07-28 18:22               ` Ram Pai
  2005-07-28 19:30                 ` Miklos Szeredi
  0 siblings, 1 reply; 37+ messages in thread
From: Ram Pai @ 2005-07-28 18:22 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: Andrew Morton, viro, Avantika Mathur, mike, janak, linux-fsdevel

On Thu, 2005-07-28 at 08:58, Miklos Szeredi wrote:
> > step 1: mount --bind /mnt /mnt
> >             a new mount 'A' is created at /mnt
> > 
> > step 2: mount --make-shared /mnt
> >            mounts under 'A' are made shared. But in this case
> >        there are no other mounts. So only 'A' will be made shared.
> >           
> >          
> > step 3: mkdir -p /mnt/1 /mnt/2
> > 		nothing special here
> > 		
> > step 4: mount --bind /usr  /mnt/1
> >                 a new mount 'B' is created  at /mnt/1 which is
> > 		 'shared;.
> >                 		
> > 
> > step 5: mount --bind /mnt  /mnt/2
> > 
> > 		a new mount 'C' is created at /mnt/2
> > 		and propogation is set between 'A' and 'C'.
> > 		note: 'C' is made shared.
> > 
> > 
> > 
> > lets say, at this point I try 
> > 		mount --bind /var /mnt/1
> > 
> > 	this is going to mount 'D' on top of mount 'B'.  However
> > 	there is no other mount to which 'B' propogates to. So that is 
> > 	it. the contents of /var is only visible at /mnt/1 and it
> > 	 propogates no where else.
> > 
> > but lets say, we tried mount --bind /var /mnt/2/1
> > 	/mnt/2/1 belongs to mount 'C'. And mounts under 'C' propogates to 'A'
> > too. So in this case a new mount 'E' is created at mnt/1/2
> 
> You mean /mnt/2/1
> 
> > i.e on top of 'C' at dentry '2'
> 
> On dentry '1'
> 
> > and due to propogation a new mount 'F' is created at /mnt/1 i.e on
> > top of mount 'A' at dentry '1' But note: /mnt/1 already has a mount
> > 'B' on top of it.  The new mount 'F' as per the 'most-current mount
> > rule' obscures 'B' even though the mount is on top of 'A'. As a
> > result the contents of /var are now visible both at /mnt/2/1 and
> > /mnt/1
> 
> True.  
> 
> > Ok the net effect is, mount at /mnt/1 is visible only under /mnt/1
> > but mount at /mnt/2/1 is visible at mount /mnt/2/1 and /mnt/1
> > This makes it confusing.
> 
> But this asymmetry is caused by the fact that mounts from /mnt/1 (B)
> don't propagate, while mounts from /mnt/2/1 (C) do.  And not because
> of the mount lookup semantics you want to change.  That change in fact
> would only _hide_ the asymmetry and not fix it: there would be
> propagation in one case and no propagation in the other.

no. there is no asymmetry as such. the propogations are working the way
they are meant to. But the confusion arises because of the mount lookup
symantics.  The reason Avantika(who is doing shared subtree testing),
had this exact confusion is because of the 'most-recent-mount visible'
rule. I dont think this rule is documented anywhere. And the natural
response to such a behavior is confusion.

> 
> If you did before step 4 in another shell 'cd /mnt/1', then after step
> 5 you did 'mount --bind /var .', that would have had the same
> symmetric effect as the 'mount --bind /var /mnt/2/1'.
> 
> > So the point I am driving at is, is there any special reason 
> > for having 'most-recent mount visible rule' instead of 'top-most mount
> > visible rule'?
> 
> I don't think, that 'topmost mount is visible' is any more logical
> than 'latest mount is visible'.  The fact that the later has been in
> the kernel for a long time means, that there may be systems relying on
> this behavior, and changing it would break them.

No I wont change this behavior. But this is something that will be a
source of confusion (probably misinterpreation to be a bug) as shared
subtrees use become more prominently used.

RP
> 
> Miklos
> -
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: mount behavior question.
  2005-07-28  7:35       ` mount behavior question Ram Pai
  2005-07-28 11:56         ` Miklos Szeredi
@ 2005-07-28 18:27         ` Bryan Henderson
  2005-07-28 19:01           ` Miklos Szeredi
  1 sibling, 1 reply; 37+ messages in thread
From: Bryan Henderson @ 2005-07-28 18:27 UTC (permalink / raw)
  To: linuxram
  Cc: Andrew Morton, Avantika Mathur, janak, linux-fsdevel, lmike,
	Miklos Szeredi, viro

I don't know enough about shared subtrees to have an opinion on what 
should happen with those, but you fundamentally asked about a perceived 
weirdness in existing Linux code, and I do have an opinion on that (which 
is that there's no weirdness).

>On analysis it turns out the culprit is the current rule which says
>'expose the most-recent-mount and not the topmost mount'

I don't think the current rule is "expose the most-recent-mount."  I see 
it as "expose the topmost mount."

I think the issue is what does "mount F over directory D" mean?

Does it mean to mount F immediately over D, in spite of anything that 
might be stacked above D right now?  Or does it mean to throw F onto the 
stack which is currently sitting over D?  Your analysis assumes it's the 
former, whereas what Linux does is consistent with the latter.

Neither of them actually makes sense.  mount over "." simply doesn't make 
sense.  Mount is a namespace operation.  "mount over D" says, "when 
someone looks up name D, ignore what's really in the directory and instead 
give him this other filesystem object."  "Mount over /mnt/cdrom" doesn't 
mean mount over the directory /mnt/cdrom.  It means mount under the name 
"cdrom" in the directory /mnt.  So "mount over '.'" means any future 
lookup of "." in that directory should hyperjump to the other mount. 
That's clearly not what anyone wants, so mount ought to recognize the 
special nature of the "." directory entry and not allow mounts over it.

If you did that, and made mount into the namespace operation it's meant to 
be, there would be no such thing as inserting a mount into the stack, 
since you have no way to refer to the covered directory -- it's no longer 
in the namespace.

I have no idea if that clarifies the shared subtree dilemma, but you ask 
if there's any pressing need for the current behavior, and I would have to 
say no, because a) neither behavior has any business existing; and b) I 
have a hard time imagining anyone depending on it.

--
Bryan Henderson                     IBM Almaden Research Center
San Jose CA                         Filesystems

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: mount behavior question.
  2005-07-28 18:27         ` Bryan Henderson
@ 2005-07-28 19:01           ` Miklos Szeredi
  2005-07-28 20:35             ` Bryan Henderson
  0 siblings, 1 reply; 37+ messages in thread
From: Miklos Szeredi @ 2005-07-28 19:01 UTC (permalink / raw)
  To: hbryan; +Cc: linuxram, akpm, mathurav, janak, linux-fsdevel, mike, viro

> I think the issue is what does "mount F over directory D" mean?
> 
> Does it mean to mount F immediately over D, in spite of anything that 
> might be stacked above D right now?  Or does it mean to throw F onto the 
> stack which is currently sitting over D?  Your analysis assumes it's the 
> former, whereas what Linux does is consistent with the latter.

In fact those two are indistinguishable.  What linux does is an
internal implementation detail.  The semantics are simple: if you
mount over a directory, that mount will be visible (no matter what was
previously visible) on lookup of that directory.

> Neither of them actually makes sense.  mount over "." simply doesn't make 
> sense.  Mount is a namespace operation.  "mount over D" says, "when 
> someone looks up name D, ignore what's really in the directory and instead 
> give him this other filesystem object."  "Mount over /mnt/cdrom" doesn't 
> mean mount over the directory /mnt/cdrom.  It means mount under the name 
> "cdrom" in the directory /mnt.  So "mount over '.'" means any future 
> lookup of "." in that directory should hyperjump to the other mount. 
> That's clearly not what anyone wants, so mount ought to recognize the 
> special nature of the "." directory entry and not allow mounts over it.

Well, mounting over '.' may not be perfect in the mathematical sense
of namespace operations, but it does make some practical sense.  I bet
you anything that some script/tool/person out there depends on it.

Miklos

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: mount behavior question.
  2005-07-28 18:22               ` Ram Pai
@ 2005-07-28 19:30                 ` Miklos Szeredi
  2005-07-28 20:09                   ` Ram Pai
  0 siblings, 1 reply; 37+ messages in thread
From: Miklos Szeredi @ 2005-07-28 19:30 UTC (permalink / raw)
  To: linuxram; +Cc: akpm, viro, mathurav, mike, janak, linux-fsdevel

> no. there is no asymmetry as such. the propogations are working the way
> they are meant to. But the confusion arises because of the mount lookup
> symantics.  The reason Avantika(who is doing shared subtree testing),
> had this exact confusion is because of the 'most-recent-mount visible'
> rule. I dont think this rule is documented anywhere. And the natural
> response to such a behavior is confusion.

I really fail to see what you are getting at.

You agree that:

  1) mount doesn't propagate from /mnt/1 to /mnt/2/1.

  2) mount propagates from /mnt/2/1 to /mnt/1.

Then you are surprised that you don't see the same thing if you mount
on /mnt/1 as if on /mnt/2/1.

I think your proposed solution would be _more_ confusing not less,
since then I'd not see the expected propagation from /mnt/2/1 to
/mnt/1.  I'd call that a bug.

Miklos

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: mount behavior question.
  2005-07-28 19:30                 ` Miklos Szeredi
@ 2005-07-28 20:09                   ` Ram Pai
  2005-07-28 20:44                     ` Miklos Szeredi
  0 siblings, 1 reply; 37+ messages in thread
From: Ram Pai @ 2005-07-28 20:09 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: Andrew Morton, viro, Avantika Mathur, mike, janak, linux-fsdevel

On Thu, 2005-07-28 at 12:30, Miklos Szeredi wrote:
> > no. there is no asymmetry as such. the propogations are working the way
> > they are meant to. But the confusion arises because of the mount lookup
> > symantics.  The reason Avantika(who is doing shared subtree testing),
> > had this exact confusion is because of the 'most-recent-mount visible'
> > rule. I dont think this rule is documented anywhere. And the natural
> > response to such a behavior is confusion.
> 
> I really fail to see what you are getting at.
> 
> You agree that:
> 
>   1) mount doesn't propagate from /mnt/1 to /mnt/2/1.
> 
>   2) mount propagates from /mnt/2/1 to /mnt/1.

Yes I agree.

> 
> Then you are surprised that you don't see the same thing if you mount
> on /mnt/1 as if on /mnt/2/1.

I am not surprised when mounts on /mnt/1 do not propogate to /mnt/2/1
This is expected, and I am perfectly happy. Because the mount is
attempted on 'B' and 'B' has nobody to propogate to.

when mount on /mnt/2/1 (i.e on C at dentry 1) is attempted, I expect
 to see a new mount 'E' at that dentry. That is happening and
I am happy with it.
I also expect that the mount propogates to /mnt/1 too (i.e on 'A' at
dentry '1'). Because 'C' and 'A' have propogation setup.

But what I also expect to see is: the new mount 'F' at /mnt/1 ( mount A
at dentry 1) be obscured by the already existing mount on /mnt/1 i.e
mount 'B'.

And the reason I want the new mount at /mnt/1 (i.e 'F') obscured is that
the new mount is not done on 'B' but is done on 'A'.

The "most recent mount rule" makes 'B' obscured instead of 'F'
and I am expecting "the topmount visible rule" to be applicable
here which makes 'B' still visible and 'F' obscured. 

Ah...its so hard without a whiteboard :( I wish there was some way to
explain it drawing some objects on the whiteboard.

I guess, I have got all the letters and the words right. Any small
mistake can distort everything. If somebody is wondering why there is
no 'D' that is because it was used for something else in the earlier
example and hence not used here.

RP

> 
> I think your proposed solution would be _more_ confusing not less,
> since then I'd not see the expected propagation from /mnt/2/1 to
> /mnt/1.  I'd call that a bug.
> 
> Miklos

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: mount behavior question.
  2005-07-28 19:01           ` Miklos Szeredi
@ 2005-07-28 20:35             ` Bryan Henderson
  2005-07-28 20:42               ` Ram Pai
  2005-07-28 20:53               ` Miklos Szeredi
  0 siblings, 2 replies; 37+ messages in thread
From: Bryan Henderson @ 2005-07-28 20:35 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: akpm, Avantika Mathur, janak, linux-fsdevel, linuxram, mike, viro

>> Does it mean to mount F immediately over D, in spite of anything that 
>> might be stacked above D right now?  Or does it mean to throw F onto 
the 
>> stack which is currently sitting over D?  Your analysis assumes it's 
the 
>> former, whereas what Linux does is consistent with the latter.
>
>In fact those two are indistinguishable.  What linux does is an
>internal implementation detail.

Then you must have misunderstood what I meant to say, because I didn't 
touch on Linux implementation at all; I'm talking only about what a user 
sees (distinguishes).  I say a user perceives a stack of mounts over a 
directory entry D.  A lookup sees the mount which is on top of the stack. 
One could conceivably 1) add a mount to the middle of that stack -- above 
D but below everything else, such that it isn't visible until everything 
above it gets removed, or 2) add the mount to the top of the stack so it's 
visible now.

>The semantics are simple: if you
>mount over a directory, that mount will be visible (no matter what was
>previously visible) on lookup of that directory.

So in my terms, Linux adds to the top of the stack, not to the middle. 
Note that saying this is stronger than what you say above, because it 
tells you not only that the mount is visible now, but when it will be 
visible in the future as people do mounts and unmounts.

>Well, mounting over '.' may not be perfect in the mathematical sense
>of namespace operations, but it does make some practical sense.  I bet
>you anything that some script/tool/person out there depends on it.

It wouldn't surprise me if someone is depending on mount over ".".  But 
I'd be surprised if someone is doing it to a directory that's already been 
mounted over (such that the stacking behavior is relevant).  That seems 
really eccentric.

--
Bryan Henderson                     IBM Almaden Research Center
San Jose CA                         Filesystems

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: mount behavior question.
  2005-07-28 20:35             ` Bryan Henderson
@ 2005-07-28 20:42               ` Ram Pai
  2005-07-28 22:27                 ` Bryan Henderson
  2005-07-28 20:53               ` Miklos Szeredi
  1 sibling, 1 reply; 37+ messages in thread
From: Ram Pai @ 2005-07-28 20:42 UTC (permalink / raw)
  To: Bryan Henderson
  Cc: Miklos Szeredi, Andrew Morton, Avantika Mathur, janak,
	linux-fsdevel, mike, viro

On Thu, 2005-07-28 at 13:35, Bryan Henderson wrote:
 
> It wouldn't surprise me if someone is depending on mount over ".".  But 
> I'd be surprised if someone is doing it to a directory that's already been 
> mounted over (such that the stacking behavior is relevant).  That seems 
> really eccentric.

Bryan, what would you expect the behavior to be when somebody mounts on
a directory what is already mounted over?  

Do you expect the new mount to obscure the already existing mount or
do you expect the already existing mount to obscure the new mount?

The issue in the current thread is pretty much revolving around this.
RP

> 
> --
> Bryan Henderson                     IBM Almaden Research Center
> San Jose CA                         Filesystems
> 


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: mount behavior question.
  2005-07-28 20:09                   ` Ram Pai
@ 2005-07-28 20:44                     ` Miklos Szeredi
  2005-07-28 20:59                       ` Ram Pai
  0 siblings, 1 reply; 37+ messages in thread
From: Miklos Szeredi @ 2005-07-28 20:44 UTC (permalink / raw)
  To: linuxram; +Cc: akpm, viro, mathurav, mike, janak, linux-fsdevel

> I am not surprised when mounts on /mnt/1 do not propogate to /mnt/2/1
> This is expected, and I am perfectly happy. Because the mount is
> attempted on 'B' and 'B' has nobody to propogate to.
> 
> when mount on /mnt/2/1 (i.e on C at dentry 1) is attempted, I expect
>  to see a new mount 'E' at that dentry. That is happening and
> I am happy with it.
> I also expect that the mount propogates to /mnt/1 too (i.e on 'A' at
> dentry '1'). Because 'C' and 'A' have propogation setup.
>  
> But what I also expect to see is: the new mount 'F' at /mnt/1 ( mount A
> at dentry 1) be obscured by the already existing mount on /mnt/1 i.e
> mount 'B'.
> 
> And the reason I want the new mount at /mnt/1 (i.e 'F') obscured is that
> the new mount is not done on 'B' but is done on 'A'.
> 
> The "most recent mount rule" makes 'B' obscured instead of 'F'
> and I am expecting "the topmount visible rule" to be applicable
> here which makes 'B' still visible and 'F' obscured. 

OK.  I'm beginning to get it :)

You want the propagated mount to be "tucked under" the existing mount.

Well, that's conceivably a valid semantic for the propagation.  I'm
not sure which I like better.  I think not hiding the propagated mount
is more intuitive.

Miklos

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: mount behavior question.
  2005-07-28 20:35             ` Bryan Henderson
  2005-07-28 20:42               ` Ram Pai
@ 2005-07-28 20:53               ` Miklos Szeredi
  2005-07-28 22:51                 ` Bryan Henderson
  1 sibling, 1 reply; 37+ messages in thread
From: Miklos Szeredi @ 2005-07-28 20:53 UTC (permalink / raw)
  To: hbryan; +Cc: akpm, mathurav, janak, linux-fsdevel, linuxram, mike, viro

> >> Does it mean to mount F immediately over D, in spite of anything that 
> >> might be stacked above D right now?  Or does it mean to throw F onto 
> the 
> >> stack which is currently sitting over D?  Your analysis assumes it's 
> the 
> >> former, whereas what Linux does is consistent with the latter.
> >
> >In fact those two are indistinguishable.  What linux does is an
> >internal implementation detail.
> 
> Then you must have misunderstood what I meant to say, because I didn't 
> touch on Linux implementation at all;

Right, sorry.

> I'm talking only about what a user sees (distinguishes).  I say a
> user perceives a stack of mounts over a directory entry D.  A lookup
> sees the mount which is on top of the stack.  One could conceivably
> 1) add a mount to the middle of that stack -- above D but below
> everything else, such that it isn't visible until everything above
> it gets removed, or 2) add the mount to the top of the stack so it's
> visible now.

OK.

One problem with 1) is that it breaks the assumption that an 'mount X;
umount X' pair is a no-op.

> >Well, mounting over '.' may not be perfect in the mathematical sense
> >of namespace operations, but it does make some practical sense.  I bet
> >you anything that some script/tool/person out there depends on it.
> 
> It wouldn't surprise me if someone is depending on mount over ".".  But 
> I'd be surprised if someone is doing it to a directory that's already been 
> mounted over (such that the stacking behavior is relevant).  That seems 
> really eccentric.

You are probably right, but one can never know.

Miklos

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: mount behavior question.
  2005-07-28 20:44                     ` Miklos Szeredi
@ 2005-07-28 20:59                       ` Ram Pai
  0 siblings, 0 replies; 37+ messages in thread
From: Ram Pai @ 2005-07-28 20:59 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: Andrew Morton, viro, Avantika Mathur, mike, janak, linux-fsdevel

On Thu, 2005-07-28 at 13:44, Miklos Szeredi wrote:
> > I am not surprised when mounts on /mnt/1 do not propogate to /mnt/2/1
> > This is expected, and I am perfectly happy. Because the mount is
> > attempted on 'B' and 'B' has nobody to propogate to.
> > 
> > when mount on /mnt/2/1 (i.e on C at dentry 1) is attempted, I expect
> >  to see a new mount 'E' at that dentry. That is happening and
> > I am happy with it.
> > I also expect that the mount propogates to /mnt/1 too (i.e on 'A' at
> > dentry '1'). Because 'C' and 'A' have propogation setup.
> >  
> > But what I also expect to see is: the new mount 'F' at /mnt/1 ( mount A
> > at dentry 1) be obscured by the already existing mount on /mnt/1 i.e
> > mount 'B'.
> > 
> > And the reason I want the new mount at /mnt/1 (i.e 'F') obscured is that
> > the new mount is not done on 'B' but is done on 'A'.
> > 
> > The "most recent mount rule" makes 'B' obscured instead of 'F'
> > and I am expecting "the topmount visible rule" to be applicable
> > here which makes 'B' still visible and 'F' obscured. 
> 
> OK.  I'm beginning to get it :)
> 
> You want the propagated mount to be "tucked under" the existing mount.

exactly. I feel that is more intuitive. Remember the transparent
building with 3 story example that I gave in the first mail of this
thread. I am just asking for that natural behavior.
> 
> Well, that's conceivably a valid semantic for the propagation.  I'm
> not sure which I like better.  I think not hiding the propagated mount
> is more intuitive.

O!! well .. we still disagree. :)
RP
> 
> Miklos


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: mount behavior question.
  2005-07-28 20:42               ` Ram Pai
@ 2005-07-28 22:27                 ` Bryan Henderson
  2005-07-28 22:59                   ` Ram Pai
  0 siblings, 1 reply; 37+ messages in thread
From: Bryan Henderson @ 2005-07-28 22:27 UTC (permalink / raw)
  To: linuxram
  Cc: Andrew Morton, Avantika Mathur, janak, linux-fsdevel, mike,
	Miklos Szeredi, viro

>Bryan, what would you expect the behavior to be when somebody mounts on
>a directory what is already mounted over? 

Well, I've tried to beg the question.  I said I don't think it's 
meaningful to mount over a directory; that one actually mounts at a name. 
And that Linux's peculiar "mount over '.'" (which is in fact mounting over 
a directory and not at a name) is weird enough that there is no natural 
expectation of it except that it should fail.

But if I had to try to merge "mount over '.'" into as consistent a model 
as possible with one of the two behaviors we've been discussing, I'd say 
that "." stands for the name by which you looked up that directory in the 
first place (so in this case, it's equivalent to mount ... /mnt).  And 
that means I would expect the new mount to obscure the already existing 
mount.

--
Bryan Henderson                     IBM Almaden Research Center
San Jose CA                         Filesystems

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: mount behavior question.
  2005-07-28 20:53               ` Miklos Szeredi
@ 2005-07-28 22:51                 ` Bryan Henderson
  0 siblings, 0 replies; 37+ messages in thread
From: Bryan Henderson @ 2005-07-28 22:51 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: akpm, Avantika Mathur, janak, linux-fsdevel, linuxram, mike, viro

>One problem with 1) [mounting into the middle of a mount stack]
>is that it breaks the assumption that an 'mount X;
>umount X' pair is a no-op.

A very good point.  Since unmounts are always from the top of the stack, 
for symmetry mounts should be there too.

Here's another tidbit of information I just verified:  umount of "." 
unmounts from the top of the stack, as opposed to unmounting the stuff you 
would see if you did "ls .".  So this is all consistent.

--
Bryan Henderson                     IBM Almaden Research Center
San Jose CA                         Filesystems

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: mount behavior question.
  2005-07-28 22:27                 ` Bryan Henderson
@ 2005-07-28 22:59                   ` Ram Pai
  0 siblings, 0 replies; 37+ messages in thread
From: Ram Pai @ 2005-07-28 22:59 UTC (permalink / raw)
  To: Bryan Henderson
  Cc: Andrew Morton, Avantika Mathur, janak, linux-fsdevel, mike,
	Miklos Szeredi, viro

On Thu, 2005-07-28 at 15:27, Bryan Henderson wrote:
> >Bryan, what would you expect the behavior to be when somebody mounts on
> >a directory what is already mounted over? 
> 
> Well, I've tried to beg the question.  I said I don't think it's 
> meaningful to mount over a directory; that one actually mounts at a name. 
> And that Linux's peculiar "mount over '.'" (which is in fact mounting over 
> a directory and not at a name) is weird enough that there is no natural 
> expectation of it except that it should fail.
> 
> But if I had to try to merge "mount over '.'" into as consistent a model 
> as possible with one of the two behaviors we've been discussing, I'd say 
> that "." stands for the name by which you looked up that directory in the 
> first place (so in this case, it's equivalent to mount ... /mnt).  And 
> that means I would expect the new mount to obscure the already existing 
> mount.

ok. maybe I am having some odd expectations here.
To me it still feels natural to tuck the mount under the earlier mount,
since you are not mounting on something which on the top, but you
are mounting  on top of something which is under(obscured).

RP


> 
> --
> Bryan Henderson                     IBM Almaden Research Center
> San Jose CA                         Filesystems


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/7] shared subtree
  2005-07-28  9:57       ` [PATCH 1/7] shared subtree Miklos Szeredi
@ 2005-07-29 19:54         ` Ram Pai
  2005-07-30  5:39           ` Miklos Szeredi
  0 siblings, 1 reply; 37+ messages in thread
From: Ram Pai @ 2005-07-29 19:54 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: Andrew Morton, viro, Avantika Mathur, mike, janak, linux-fsdevel,
	linux-kernel

On Thu, 2005-07-28 at 02:57, Miklos Szeredi wrote:
> > > This is an example, where having struct pnode just complicates things.
> > > If there was no struct pnode, this function would be just one line:
> > > setting the shared flag.
> > So your comment is mostly about getting rid of pnode and distributing
> > the pnode functionality in the vfsmount structure.
> 
> Yes, sorry if I didn't make it clear.
> 
> > I know you are thinking of just having the necessary propogation list in
> > the vfsmount structure itself.  Yes true with that implementation the
> > complication is reduced in this part of the code, but really complicates
> > the propogation traversal routines. 
> 
> On the contrary, I think it will simplify the traversal routines.
> 
> Here's an iterator function I coded up.  Not tested at all (may not
> even compile):

Your suggested code has bugs. But I understand what you are aiming at. 

Maybe you are right. I will try out a implementation using your idea.

Hmm.. lots of code change, and testing.

> 
> struct vfsmount {
> 	/* ... */
>         
> 	struct list_head mnt_share;      /* circular list of shared mounts */
> 	struct list_head mnt_slave_list; /* list of slave mounts */
> 	struct list_head mnt_slave;      /* slave list entry */
> 	struct vfsmount *master;         /* slave is on master->mnt_slave_list */
> };
> 
> static inline struct vfsmount *next_shared(struct vfsmount *p)
> {
> 	return list_entry(p->mnt_share.next, struct vfsmount, mnt_share);
> }
> 
> static inline struct vfsmount *first_slave(struct vfsmount *p)
> {
> 	return list_entry(p->mnt_slave_list.next, struct vfsmount, mnt_slave);
> }
> 
> static inline struct vfsmount *next_slave(struct vfsmount *p)
> {
> 	return list_entry(p->mnt_slave.next, struct vfsmount, mnt_slave);
> }
> 
> static struct vfsmount *propagation_next(struct vfsmount *p,
> 					 struct vfsmount *base)
> {
> 	/* first iterate over the slaves */
> 	if (!list_empty(&p->mnt_slave_list))
> 		return first_slave(p);

I think this code should be
		if (!list_empty(&p->mnt_slave))
			return next_slave(p);

Right? I think I get the idea. 



RP

> 
> 	while (1) {
> 		struct vfsmount *q;
> 
> 		/* more vfsmounts belong to the pnode? */
> 		if (!list_empty(&p->mnt_share)) {
> 			p = next_shared(p);
> 			if (list_empty(&p->mnt_slave) && p != base)
> 				return p;
> 		}
> 		if (p == base)
> 			break;
> 		
> 		BUG_ON(list_empty(&p->mnt_slave));
> 
> 		/* more slaves? */
> 		q = next_slave(p);
> 		if (p->master != q)
> 			return q;
> 
> 		/* back at master */
> 		p = q;
> 	}
> 
> 	return NULL;
> }
> 
> -
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/7] shared subtree
  2005-07-29 19:54         ` Ram Pai
@ 2005-07-30  5:39           ` Miklos Szeredi
  2005-07-31  0:45             ` Ram Pai
  0 siblings, 1 reply; 37+ messages in thread
From: Miklos Szeredi @ 2005-07-30  5:39 UTC (permalink / raw)
  To: linuxram; +Cc: akpm, viro, mathurav, mike, janak, linux-fsdevel, linux-kernel

> > static struct vfsmount *propagation_next(struct vfsmount *p,
> > 					 struct vfsmount *base)
> > {
> > 	/* first iterate over the slaves */
> > 	if (!list_empty(&p->mnt_slave_list))
> > 		return first_slave(p);
> 
> I think this code should be
> 		if (!list_empty(&p->mnt_slave))
> 			return next_slave(p);
> 
> Right? I think I get the idea. 

This is a depth-first search, so first_slave() is right.

Here's a less buggy (and even more simplified) version of the
function.  Note: it must be called with 'origin' either on a slave
list or at the root pnode.  That's because the function checks if the
all vfsmounts in a pnode have been traversed by looking at the
emptiness of mnt_slave.  So if origin was in a slave pnode, but is not
the actual slave link, the algorithm will go off the starting pnode
and up to it's master.

So here's a preparation function that finds the right place to start
the propagation.

static struct vfsmount *propagation_first(struct vfsmount *p)
{
	struct vfsmount *q = p;

	while (list_empty(&q->mnt_slave)) {
		q = next_shared(q);
		if (q == p)
			break;
	}
	return q;
}

static struct vfsmount *propagation_next(struct vfsmount *p,
					 struct vfsmount *origin)
{
	/* are there any slaves of this mount? */
	if (!list_empty(&p->mnt_slave_list))
		return first_slave(p);

	while (1) {
		/* if p->mnt_share is empty, this is a no-op */
		p = next_shared(p);

		/* finished traversing? */
		if (p == origin)
			break;

		/* more vfsmounts belong to the pnode? */
		if (list_empty(&p->mnt_slave))
			return p;
		
		/* more slaves? */
		if (p->mnt_slave.next != &p->mnt_master->mnt_slave_list)
			return next_slave(p);

		/* back at master */
		p = p->mnt_master;
	}

	return NULL;
}


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/7] shared subtree
  2005-07-30  5:39           ` Miklos Szeredi
@ 2005-07-31  0:45             ` Ram Pai
  2005-07-31  7:52               ` Miklos Szeredi
  0 siblings, 1 reply; 37+ messages in thread
From: Ram Pai @ 2005-07-31  0:45 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: Andrew Morton, viro, Avantika Mathur, mike, janak, linux-fsdevel,
	linux-kernel

On Fri, 2005-07-29 at 22:39, Miklos Szeredi wrote:
> > > static struct vfsmount *propagation_next(struct vfsmount *p,
> > > 					 struct vfsmount *base)
> > > {
> > > 	/* first iterate over the slaves */
> > > 	if (!list_empty(&p->mnt_slave_list))
> > > 		return first_slave(p);
> > 
> > I think this code should be
> > 		if (!list_empty(&p->mnt_slave))
> > 			return next_slave(p);
> > 
> > Right? I think I get the idea. 
> 
> This is a depth-first search, so first_slave() is right.

Ok. I have started implementing your idea. But the implementation is no
simple.  Its becomes a complex mess. Atleast in the case of pnode
datastructure implementation, the propogation was all abstracted and
concentrated in the pnode datastructure. 

Here is a sample implementation of do_make_slave() with your idea. 

static int do_make_slave(struct vfsmount *mnt)
{
        int err=0;
        struct vfsmount *peer_mnt;

        spin_lock(&vfspnode_lock);
        if (!IS_MNT_SHARED(mnt)) {
                spin_unlock(&vfspnode_lock);
                err = -EINVAL;
                goto out;
        }

        peer_mnt = list_entry(mnt->mnt_share.next, struct vfsmount,
mnt_share);
        if (peer_mnt == mnt)
                peer_mnt = NULL;

        list_del_init(&mnt->mnt_share);
        if (peer_mnt) {
                /* move the slave list to the peer_mnt */
                list_splice(&mnt->mnt_slave, &peer_mnt->mnt_slave);
                list_add(&mnt->mnt_slave_list, &peer_mnt->mnt_slave);
                set_mnt_slave(mnt);
        } else {
                struct vfsmount *slave_mnt, *t_slave_mnt;
                list_for_each_entry_safe(slave_mnt, t_slave_mnt,
                                &mnt->mnt_slave, mnt_slave_list) {
                        CLEAR_MNT_SLAVE(slave_mnt);
                        list_del_init(&slave_mnt->mnt_slave_list);
                }
        }
        list_del_init(&mnt->mnt_slave);
        mnt->mnt_master = peer_mnt;
        spin_unlock(&vfspnode_lock);
out:
        return err;
}

Do you still believe that your idea is simpler? 

The most difficult part is, attaching shared vfs tree, which needs to be
attached at someother shared mount point. The problem here is while
traversing the propogation tree, one has to build another similar
propogation tree for the new child mount. Its a 2 dimensional tree walk,
i.e one walk is along the vfstree, and the other walk is along the pnode
tree for each mount.  Its much easier to abstract out the pnode
operations and concentrate on them separately than mixing the
functionality of vfs and pnode in a single vfs datastructure.

In my code, I have abstracted out the pnode tree traversing operation
using a single iterator function pnode_next().

Think about it, and let me know if it is worth the effort of changing
the implementation. I sincerely feel it just shifts complexity instead
of reducing complexity. I can eventually come up with a fully tested
implementation using your idea, but I am still not convinced that
it reduces complexity.


RP

> 
> Here's a less buggy (and even more simplified) version of the
> function.  Note: it must be called with 'origin' either on a slave
> list or at the root pnode.  That's because the function checks if the
> all vfsmounts in a pnode have been traversed by looking at the
> emptiness of mnt_slave.  So if origin was in a slave pnode, but is not
> the actual slave link, the algorithm will go off the starting pnode
> and up to it's master.
> 
> So here's a preparation function that finds the right place to start
> the propagation.
> 
> static struct vfsmount *propagation_first(struct vfsmount *p)
> {
> 	struct vfsmount *q = p;
> 
> 	while (list_empty(&q->mnt_slave)) {
> 		q = next_shared(q);
> 		if (q == p)
> 			break;
> 	}
> 	return q;
> }
> 
> static struct vfsmount *propagation_next(struct vfsmount *p,
> 					 struct vfsmount *origin)
> {
> 	/* are there any slaves of this mount? */
> 	if (!list_empty(&p->mnt_slave_list))
> 		return first_slave(p);
> 
> 	while (1) {
> 		/* if p->mnt_share is empty, this is a no-op */
> 		p = next_shared(p);
> 
> 		/* finished traversing? */
> 		if (p == origin)
> 			break;
> 
> 		/* more vfsmounts belong to the pnode? */
> 		if (list_empty(&p->mnt_slave))
> 			return p;
> 		
> 		/* more slaves? */
> 		if (p->mnt_slave.next != &p->mnt_master->mnt_slave_list)
> 			return next_slave(p);
> 
> 		/* back at master */
> 		p = p->mnt_master;
> 	}
> 
> 	return NULL;
> }
> 


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/7] shared subtree
  2005-07-31  0:45             ` Ram Pai
@ 2005-07-31  7:52               ` Miklos Szeredi
  2005-07-31  8:25                 ` Miklos Szeredi
  0 siblings, 1 reply; 37+ messages in thread
From: Miklos Szeredi @ 2005-07-31  7:52 UTC (permalink / raw)
  To: linuxram; +Cc: akpm, viro, mathurav, mike, janak, linux-fsdevel, linux-kernel

> Ok. I have started implementing your idea. But the implementation is no
> simple.  Its becomes a complex mess. Atleast in the case of pnode
> datastructure implementation, the propogation was all abstracted and
> concentrated in the pnode datastructure. 
> 
> Here is a sample implementation of do_make_slave() with your idea. 
> 
> static int do_make_slave(struct vfsmount *mnt)
> {
>         int err=0;
>         struct vfsmount *peer_mnt;
> 
>         spin_lock(&vfspnode_lock);
>         if (!IS_MNT_SHARED(mnt)) {
>                 spin_unlock(&vfspnode_lock);
>                 err = -EINVAL;
>                 goto out;
>         }
> 
>         peer_mnt = list_entry(mnt->mnt_share.next, struct vfsmount,
> mnt_share);
>         if (peer_mnt == mnt)
>                 peer_mnt = NULL;
> 
>         list_del_init(&mnt->mnt_share);
>         if (peer_mnt) {
>                 /* move the slave list to the peer_mnt */
>                 list_splice(&mnt->mnt_slave, &peer_mnt->mnt_slave);
>                 list_add(&mnt->mnt_slave_list, &peer_mnt->mnt_slave);
>                 set_mnt_slave(mnt);
>         } else {
>                 struct vfsmount *slave_mnt, *t_slave_mnt;
>                 list_for_each_entry_safe(slave_mnt, t_slave_mnt,
>                                 &mnt->mnt_slave, mnt_slave_list) {
>                         CLEAR_MNT_SLAVE(slave_mnt);
>                         list_del_init(&slave_mnt->mnt_slave_list);
>                 }
>         }
>         list_del_init(&mnt->mnt_slave);
>         mnt->mnt_master = peer_mnt;
>         spin_unlock(&vfspnode_lock);
> out:
>         return err;
> }
> 
> Do you still believe that your idea is simpler? 

Well, you have bundled do_make_slave(), pnode_member_to_slave() and
empty_pnode() all into one function.  I think your original split is
quite nice.  If you'd split this function up like that, I think you'd
agree, that the end result is simpler.

Btw. though you don't have a pnode datastructure, that doesn't mean
you can't separate the shared subtree related functions into a
separate pnode.c (or whatever) file.  I think the do_make_* functions
also belong in there, not namespace.c.   

> The most difficult part is, attaching shared vfs tree, which needs
> to be attached at someother shared mount point. The problem here is
> while traversing the propogation tree, one has to build another
> similar propogation tree for the new child mount. Its a 2
> dimensional tree walk, i.e one walk is along the vfstree, and the
> other walk is along the pnode tree for each mount.  Its much easier
> to abstract out the pnode operations and concentrate on them
> separately than mixing the functionality of vfs and pnode in a
> single vfs datastructure.

I think you are bluring the distinction between two separate concepts:
"structure" and "function".

You can concentrate separate "structures" in a single object, without
having to mix the "functions".

The linux 'struct list' is a wonderful way to do this.

C++ inheritance is another (much less wonderful) way.

Mixing "functions" is bad.  Concentrating "structure" is good.

> In my code, I have abstracted out the pnode tree traversing operation
> using a single iterator function pnode_next().

Well, but the pnode traversing is only part of the problem.  In the
end you really need to traverse vfsmounts.  Pnodes are really just a
conceptual helper.  I think Al Viro didn't make that clear enough in
his RFC.

> Think about it, and let me know if it is worth the effort of changing
> the implementation. I sincerely feel it just shifts complexity instead
> of reducing complexity. I can eventually come up with a fully tested
> implementation using your idea, but I am still not convinced that
> it reduces complexity.

I really don't want to force it on you.  If you feel comfortable with
the current code, that's OK.  But keep in mind, that this is a central
part of the kernel, and so the code needs to be understood to the last
little detail, not just by you.  And doing that with >1000 new lines
of code is not an easy task.

So the less code (and less complexity) you are able to make this
functionality work, the easier it will be for others to understand and
accept it.

Miklos

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/7] shared subtree
  2005-07-31  7:52               ` Miklos Szeredi
@ 2005-07-31  8:25                 ` Miklos Szeredi
  0 siblings, 0 replies; 37+ messages in thread
From: Miklos Szeredi @ 2005-07-31  8:25 UTC (permalink / raw)
  To: linuxram; +Cc: akpm, viro, mathurav, mike, janak, linux-fsdevel, linux-kernel

> > Do you still believe that your idea is simpler? 
> 
> Well, you have bundled do_make_slave(), pnode_member_to_slave() and
> empty_pnode() all into one function.  I think your original split is
> quite nice.  If you'd split this function up like that, I think you'd
> agree, that the end result is simpler.

Also you can still use the pnode concept in naming functions and
explanations.  For example empty_pnode() is a good function name even
if there's no 'struct pnode'.  Pnodes still exist, they just don't
have a corresponding object.  

Miklos

^ permalink raw reply	[flat|nested] 37+ messages in thread

end of thread, other threads:[~2005-07-31  8:26 UTC | newest]

Thread overview: 37+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2005-07-25 22:44 (unknown) Ram Pai
2005-07-25 22:44 ` (unknown) Ram Pai
2005-07-25 22:44 ` (unknown) Ram Pai
2005-07-25 22:44 ` (unknown) Ram Pai
2005-07-25 22:44 ` (unknown) Ram Pai
2005-07-25 22:44 ` (unknown) Ram Pai
2005-07-25 22:44 ` (unknown) Ram Pai
2005-07-25 22:44 ` (unknown) Ram Pai
2005-07-26  2:53 ` supposed to be shared subtree patches Ram Pai
     [not found] ` <20050725225908.031752000@localhost>
2005-07-27 19:13   ` [PATCH 3/7] shared subtree Miklos Szeredi
2005-07-27 20:30     ` Ram Pai
2005-07-28  8:34       ` Miklos Szeredi
     [not found] ` <20050725225907.007405000@localhost>
2005-07-27 19:54   ` [PATCH 1/7] " Miklos Szeredi
2005-07-27 21:39     ` Ram Pai
2005-07-28  7:35       ` mount behavior question Ram Pai
2005-07-28 11:56         ` Miklos Szeredi
2005-07-28 15:02           ` Ram Pai
2005-07-28 15:58             ` Miklos Szeredi
2005-07-28 18:22               ` Ram Pai
2005-07-28 19:30                 ` Miklos Szeredi
2005-07-28 20:09                   ` Ram Pai
2005-07-28 20:44                     ` Miklos Szeredi
2005-07-28 20:59                       ` Ram Pai
2005-07-28 18:27         ` Bryan Henderson
2005-07-28 19:01           ` Miklos Szeredi
2005-07-28 20:35             ` Bryan Henderson
2005-07-28 20:42               ` Ram Pai
2005-07-28 22:27                 ` Bryan Henderson
2005-07-28 22:59                   ` Ram Pai
2005-07-28 20:53               ` Miklos Szeredi
2005-07-28 22:51                 ` Bryan Henderson
2005-07-28  9:57       ` [PATCH 1/7] shared subtree Miklos Szeredi
2005-07-29 19:54         ` Ram Pai
2005-07-30  5:39           ` Miklos Szeredi
2005-07-31  0:45             ` Ram Pai
2005-07-31  7:52               ` Miklos Szeredi
2005-07-31  8:25                 ` Miklos Szeredi

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).