* (no subject)
2005-07-25 22:44 Ram Pai
@ 2005-07-25 22:44 ` Ram Pai
2005-07-27 19:54 ` [PATCH 1/7] shared subtree Miklos Szeredi
2005-07-25 22:44 ` Ram Pai
` (6 subsequent siblings)
7 siblings, 1 reply; 20+ messages in thread
From: Ram Pai @ 2005-07-25 22:44 UTC (permalink / raw)
To: akpm, Al Viro; +Cc: Avantika Mathur, Mike Waychison
, miklos@szeredi.hu, Janak Desai <janak@us.ibm.com>, linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org
Subject: [PATCH 1/7] shared subtree
Content-Type: text/x-patch; name=shared_private_slave.patch
Content-Disposition: inline; filename=shared_private_slave.patch
This patch adds the shared/private/slave support for VFS trees.
Signed-off-by: Ram Pai <linuxram@us.ibm.com>
fs/Makefile | 2
fs/dcache.c | 2
fs/namespace.c | 93 ++++++++++
fs/pnode.c | 441 ++++++++++++++++++++++++++++++++++++++++++++++++++
include/linux/fs.h | 5
include/linux/mount.h | 44 ++++
include/linux/pnode.h | 90 ++++++++++
7 files changed, 673 insertions(+), 4 deletions(-)
Index: 2.6.12.work2/fs/namespace.c
===================================================================
--- 2.6.12.work2.orig/fs/namespace.c
+++ 2.6.12.work2/fs/namespace.c
@@ -22,6 +22,7 @@
#include <linux/namei.h>
#include <linux/security.h>
#include <linux/mount.h>
+#include <linux/pnode.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -62,6 +63,7 @@ struct vfsmount *alloc_vfsmnt(const char
INIT_LIST_HEAD(&mnt->mnt_mounts);
INIT_LIST_HEAD(&mnt->mnt_list);
INIT_LIST_HEAD(&mnt->mnt_fslink);
+ INIT_LIST_HEAD(&mnt->mnt_pnode_mntlist);
if (name) {
int size = strlen(name)+1;
char *newname = kmalloc(size, GFP_KERNEL);
@@ -615,6 +617,95 @@ out_unlock:
return err;
}
+static int do_make_shared(struct vfsmount *mnt)
+{
+ int err=0;
+ struct vfspnode *old_pnode = NULL;
+ /*
+ * if the mount is already a slave mount,
+ * allocate a new pnode and make it
+ * a slave pnode of the original pnode.
+ */
+ if (IS_MNT_SLAVE(mnt)) {
+ old_pnode = mnt->mnt_pnode;
+ pnode_del_slave_mnt(mnt);
+ }
+ if(!IS_MNT_SHARED(mnt)) {
+ mnt->mnt_pnode = pnode_alloc();
+ if(!mnt->mnt_pnode) {
+ pnode_add_slave_mnt(old_pnode, mnt);
+ err = -ENOMEM;
+ goto out;
+ }
+ pnode_add_member_mnt(mnt->mnt_pnode, mnt);
+ }
+ if(old_pnode)
+ pnode_add_slave_pnode(old_pnode, mnt->mnt_pnode);
+ set_mnt_shared(mnt);
+out:
+ return err;
+}
+
+static int do_make_slave(struct vfsmount *mnt)
+{
+ int err=0;
+
+ if (IS_MNT_SLAVE(mnt))
+ goto out;
+ /*
+ * only shared mounts can
+ * be made slave
+ */
+ if (!IS_MNT_SHARED(mnt)) {
+ err = -EINVAL;
+ goto out;
+ }
+ pnode_member_to_slave(mnt);
+out:
+ return err;
+}
+
+static int do_make_private(struct vfsmount *mnt)
+{
+ if(mnt->mnt_pnode)
+ pnode_disassociate_mnt(mnt);
+ set_mnt_private(mnt);
+ return 0;
+}
+
+/*
+ * recursively change the type of the mountpoint.
+ */
+static int do_change_type(struct nameidata *nd, int flag)
+{
+ struct vfsmount *m, *mnt = nd->mnt;
+ int err=0;
+
+ if (!(flag & MS_SHARED) && !(flag & MS_PRIVATE)
+ && !(flag & MS_SLAVE))
+ return -EINVAL;
+
+ if (nd->dentry != nd->mnt->mnt_root)
+ return -EINVAL;
+
+ spin_lock(&vfsmount_lock);
+ for (m = mnt; m; m = next_mnt(m, mnt)) {
+ switch (flag) {
+ case MS_SHARED:
+ err = do_make_shared(m);
+ break;
+ case MS_SLAVE:
+ err = do_make_slave(m);
+ break;
+ case MS_PRIVATE:
+ err = do_make_private(m);
+ break;
+ }
+ }
+ spin_unlock(&vfsmount_lock);
+ return err;
+}
+
/*
* do loopback mount.
*/
@@ -1049,6 +1140,8 @@ long do_mount(char * dev_name, char * di
data_page);
else if (flags & MS_BIND)
retval = do_loopback(&nd, dev_name, flags & MS_REC);
+ else if (flags & MS_SHARED || flags & MS_PRIVATE || flags & MS_SLAVE)
+ retval = do_change_type(&nd, flags);
else if (flags & MS_MOVE)
retval = do_move_mount(&nd, dev_name);
else
Index: 2.6.12.work2/fs/pnode.c
===================================================================
--- /dev/null
+++ 2.6.12.work2/fs/pnode.c
@@ -0,0 +1,441 @@
+/*
+ * linux/fs/pnode.c
+ *
+ * (C) Copyright IBM Corporation 2005.
+ * Released under GPL v2.
+ * Author : Ram Pai (linuxram@us.ibm.com)
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/syscalls.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/smp_lock.h>
+#include <linux/init.h>
+#include <linux/quotaops.h>
+#include <linux/acct.h>
+#include <linux/module.h>
+#include <linux/seq_file.h>
+#include <linux/namespace.h>
+#include <linux/namei.h>
+#include <linux/security.h>
+#include <linux/mount.h>
+#include <linux/pnode.h>
+#include <asm/uaccess.h>
+#include <asm/unistd.h>
+#include <stdarg.h>
+
+
+static kmem_cache_t * pnode_cachep;
+
+/* spinlock for pnode related operations */
+ __cacheline_aligned_in_smp DEFINE_SPINLOCK(vfspnode_lock);
+
+enum pnode_vfs_type {
+ PNODE_MEMBER_VFS = 0x01,
+ PNODE_SLAVE_VFS = 0x02
+};
+
+void __init pnode_init(unsigned long mempages)
+{
+ pnode_cachep = kmem_cache_create("pnode_cache",
+ sizeof(struct vfspnode), 0,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
+}
+
+struct vfspnode * pnode_alloc(void)
+{
+ struct vfspnode *pnode = kmem_cache_alloc(pnode_cachep, GFP_KERNEL);
+ INIT_LIST_HEAD(&pnode->pnode_vfs);
+ INIT_LIST_HEAD(&pnode->pnode_slavevfs);
+ INIT_LIST_HEAD(&pnode->pnode_slavepnode);
+ INIT_LIST_HEAD(&pnode->pnode_peer_slave);
+ pnode->pnode_master = NULL;
+ pnode->pnode_flags = 0;
+ atomic_set(&pnode->pnode_count,0);
+ return pnode;
+}
+
+void inline pnode_free(struct vfspnode *pnode)
+{
+ kmem_cache_free(pnode_cachep, pnode);
+}
+
+/*
+ * __put_pnode() should be called with vfspnode_lock held
+ */
+void __put_pnode(struct vfspnode *pnode)
+{
+ struct vfspnode *tmp_pnode;
+ do {
+ tmp_pnode = pnode->pnode_master;
+ list_del_init(&pnode->pnode_peer_slave);
+ BUG_ON(!list_empty(&pnode->pnode_vfs));
+ BUG_ON(!list_empty(&pnode->pnode_slavevfs));
+ BUG_ON(!list_empty(&pnode->pnode_slavepnode));
+ pnode_free(pnode);
+ pnode = tmp_pnode;
+ if (!pnode || !atomic_dec_and_test(&pnode->pnode_count))
+ break;
+ } while(pnode);
+}
+
+static void inline pnode_add_mnt(struct vfspnode *pnode,
+ struct vfsmount *mnt, int slave)
+{
+ if (!pnode || !mnt)
+ return;
+ spin_lock(&vfspnode_lock);
+ mnt->mnt_pnode = pnode;
+ if (slave) {
+ set_mnt_slave(mnt);
+ list_add(&mnt->mnt_pnode_mntlist, &pnode->pnode_slavevfs);
+ } else {
+ set_mnt_shared(mnt);
+ list_add(&mnt->mnt_pnode_mntlist, &pnode->pnode_vfs);
+ }
+ get_pnode(pnode);
+ spin_unlock(&vfspnode_lock);
+}
+
+void pnode_add_member_mnt(struct vfspnode *pnode,
+ struct vfsmount *mnt)
+{
+ pnode_add_mnt(pnode, mnt, 0);
+}
+
+void pnode_add_slave_mnt(struct vfspnode *pnode,
+ struct vfsmount *mnt)
+{
+ pnode_add_mnt(pnode, mnt, 1);
+}
+
+
+void pnode_add_slave_pnode(struct vfspnode *pnode,
+ struct vfspnode *slave_pnode)
+{
+ if (!pnode || !slave_pnode)
+ return;
+ spin_lock(&vfspnode_lock);
+ slave_pnode->pnode_master = pnode;
+ slave_pnode->pnode_flags = 0;
+ list_add(&slave_pnode->pnode_peer_slave, &pnode->pnode_slavepnode);
+ get_pnode(pnode);
+ spin_unlock(&vfspnode_lock);
+}
+
+/*
+ * merge 'pnode' into 'peer_pnode' and get rid of pnode
+ * @pnode: pnode the contents of which have to be merged
+ * @peer_pnode: pnode into which the contents are merged
+ */
+int pnode_merge_pnode(struct vfspnode *pnode, struct vfspnode *peer_pnode)
+{
+ struct vfspnode *slave_pnode, *pnext;
+ struct vfsmount *mnt, *slave_mnt, *next;
+
+ list_for_each_entry_safe(slave_pnode, pnext,
+ &pnode->pnode_slavepnode, pnode_peer_slave) {
+ slave_pnode->pnode_master = peer_pnode;
+ list_move(&slave_pnode->pnode_peer_slave,
+ &peer_pnode->pnode_slavepnode);
+ put_pnode_locked(pnode);
+ get_pnode(peer_pnode);
+ }
+
+ list_for_each_entry_safe(slave_mnt, next,
+ &pnode->pnode_slavevfs, mnt_pnode_mntlist) {
+ slave_mnt->mnt_pnode = peer_pnode;
+ list_move(&slave_mnt->mnt_pnode_mntlist,
+ &peer_pnode->pnode_slavevfs);
+ put_pnode_locked(pnode);
+ get_pnode(peer_pnode);
+ }
+
+ list_for_each_entry_safe(mnt, next,
+ &pnode->pnode_vfs, mnt_pnode_mntlist) {
+ mnt->mnt_pnode = peer_pnode;
+ list_move(&mnt->mnt_pnode_mntlist,
+ &peer_pnode->pnode_vfs);
+ put_pnode_locked(pnode);
+ get_pnode(peer_pnode);
+ }
+ return 0;
+}
+
+/*
+ * called when pnode has no member mounts. Merge all the slave mounts/pnodes
+ * of this pnode with that of its master pnode. If master pnode does not exist,
+ * convert all the slave mounts to private mounts.
+ */
+static void empty_pnode(struct vfspnode *pnode)
+{
+	struct vfsmount *slave_mnt, *next;
+	struct vfspnode *master_pnode, *slave_pnode, *pnext;
+
+ if ((master_pnode = pnode->pnode_master)) {
+ pnode->pnode_master = NULL;
+ list_del_init(&pnode->pnode_peer_slave);
+ pnode_merge_pnode(pnode, master_pnode);
+ put_pnode_locked(master_pnode);
+ } else {
+ list_for_each_entry_safe(slave_mnt, next,
+ &pnode->pnode_slavevfs, mnt_pnode_mntlist) {
+ list_del_init(&slave_mnt->mnt_pnode_mntlist);
+ set_mnt_private(slave_mnt);
+ put_pnode_locked(pnode);
+ }
+ list_for_each_entry_safe(slave_pnode, pnext,
+ &pnode->pnode_slavepnode, pnode_peer_slave) {
+ slave_pnode->pnode_master = NULL;
+ list_del_init(&slave_pnode->pnode_peer_slave);
+ put_pnode_locked(pnode);
+ }
+ }
+}
+
+static void __pnode_disassociate_mnt(struct vfsmount *mnt)
+{
+ struct vfspnode *pnode = mnt->mnt_pnode;
+
+ spin_lock(&vfspnode_lock);
+ list_del_init(&mnt->mnt_pnode_mntlist);
+
+ if (list_empty(&pnode->pnode_vfs))
+ empty_pnode(pnode);
+
+ put_pnode_locked(pnode);
+
+ spin_unlock(&vfspnode_lock);
+ mnt->mnt_pnode = NULL;
+}
+
+void pnode_del_slave_mnt(struct vfsmount *mnt)
+{
+ if (!mnt || !mnt->mnt_pnode)
+ return;
+ __pnode_disassociate_mnt(mnt);
+ CLEAR_MNT_SLAVE(mnt);
+}
+
+void pnode_del_member_mnt(struct vfsmount *mnt)
+{
+ if (!mnt || !mnt->mnt_pnode)
+ return;
+ __pnode_disassociate_mnt(mnt);
+ CLEAR_MNT_SHARED(mnt);
+}
+
+void pnode_member_to_slave(struct vfsmount *mnt)
+{
+ struct vfspnode *pnode = mnt->mnt_pnode;
+ if (!mnt || !pnode)
+ return;
+
+ spin_lock(&vfspnode_lock);
+
+ list_del_init(&mnt->mnt_pnode_mntlist);
+ list_add(&mnt->mnt_pnode_mntlist, &pnode->pnode_slavevfs);
+ set_mnt_slave(mnt);
+
+ if (list_empty(&pnode->pnode_vfs))
+ empty_pnode(pnode);
+
+ spin_unlock(&vfspnode_lock);
+ return;
+}
+
+void pnode_disassociate_mnt(struct vfsmount *mnt)
+{
+ if (!mnt || !mnt->mnt_pnode)
+ return;
+ __pnode_disassociate_mnt(mnt);
+ CLEAR_MNT_SHARED(mnt);
+ CLEAR_MNT_SLAVE(mnt);
+}
+
+struct pcontext {
+ struct vfspnode *start;
+ int level;
+ struct vfspnode *master_pnode;
+ struct vfspnode *pnode;
+};
+
+/*
+ * Walk the pnode tree for each pnode encountered.
+ * @context: provides context on the state of the last walk in the pnode
+ * tree.
+ */
+static int pnode_next(struct pcontext *context)
+{
+ struct vfspnode *pnode = context->pnode;
+ struct vfspnode *master_pnode=context->master_pnode;
+ struct list_head *next;
+
+ if (!pnode) {
+ BUG_ON(!context->start);
+ get_pnode(context->start);
+ context->pnode = context->start;
+ context->master_pnode = NULL;
+ context->level = 0;
+ return 1;
+ }
+
+ spin_lock(&vfspnode_lock);
+ next = pnode->pnode_slavepnode.next;
+ if (next == &pnode->pnode_slavepnode) {
+ while (1) {
+ int flag;
+
+ if (pnode == context->start) {
+ put_pnode_locked(pnode);
+ spin_unlock(&vfspnode_lock);
+ BUG_ON(context->level != 0);
+ return 0;
+ }
+
+ next = pnode->pnode_peer_slave.next;
+ flag = (next != &pnode->pnode_master->pnode_slavepnode);
+ put_pnode_locked(pnode);
+
+ if (flag)
+ break;
+
+ pnode = master_pnode;
+ master_pnode = pnode->pnode_master;
+ context->level--;
+ }
+ } else {
+ master_pnode = pnode;
+ context->level++;
+ }
+
+ pnode = list_entry(next, struct vfspnode, pnode_peer_slave);
+ get_pnode(pnode);
+
+ context->pnode = pnode;
+ context->master_pnode = master_pnode;
+ spin_unlock(&vfspnode_lock);
+ return 1;
+}
+
+/*
+ * skip the rest of the tree, cleaning up
+ * reference to pnodes held in pnode_next().
+ */
+static void pnode_end(struct pcontext *context)
+{
+ struct vfspnode *p = context->pnode;
+ struct vfspnode *start = context->start;
+
+ do {
+ put_pnode(p);
+ } while (p != start && (p = p->pnode_master));
+ return;
+}
+
+/*
+ * traverse the pnode tree and at each pnode encountered, execute the
+ * pnode_fnc(). For each vfsmount encountered call the vfs_fnc().
+ *
+ * @pnode: pnode tree to be traversed
+ * @in_data: input data
+ * @out_data: output data
+ * @pnode_func: function to be called when a new pnode is encountered.
+ * @vfs_func: function to be called on each slave and member vfs belonging
+ * to the pnode.
+ */
+static int pnode_traverse(struct vfspnode *pnode,
+ void *in_data,
+ void **out_data,
+ int (*pnode_pre_func)(struct vfspnode *,
+ void *, void **, va_list),
+ int (*pnode_post_func)(struct vfspnode *,
+ void *, va_list),
+ int (*vfs_func)(struct vfsmount *,
+ enum pnode_vfs_type, void *, va_list),
+ ...)
+{
+ va_list args;
+ int ret = 0, level;
+ void *my_data, *data_from_master;
+ struct vfspnode *master_pnode;
+ struct vfsmount *slave_mnt, *member_mnt, *t_m;
+ struct pcontext context;
+ static void *p_array[PNODE_MAX_SLAVE_LEVEL];
+
+ context.start = pnode;
+ context.pnode = NULL;
+ /*
+ * determine whether to process vfs first or the
+ * slave pnode first
+ */
+ while (pnode_next(&context)) {
+ level = context.level;
+
+ if (level >= PNODE_MAX_SLAVE_LEVEL)
+ goto error;
+
+ pnode = context.pnode;
+ master_pnode = context.master_pnode;
+
+ if (master_pnode) {
+ data_from_master = p_array[level-1];
+ my_data = NULL;
+ } else {
+ data_from_master = NULL;
+ my_data = in_data;
+ }
+
+ if (pnode_pre_func) {
+ va_start(args, vfs_func);
+ if((ret = pnode_pre_func(pnode,
+ data_from_master, &my_data, args)))
+ goto error;
+ va_end(args);
+ }
+
+ // traverse member vfsmounts
+ spin_lock(&vfspnode_lock);
+ list_for_each_entry_safe(member_mnt,
+ t_m, &pnode->pnode_vfs, mnt_pnode_mntlist) {
+
+ spin_unlock(&vfspnode_lock);
+ va_start(args, vfs_func);
+ if ((ret = vfs_func(member_mnt,
+ PNODE_MEMBER_VFS, my_data, args)))
+ goto error;
+ va_end(args);
+ spin_lock(&vfspnode_lock);
+ }
+ list_for_each_entry_safe(slave_mnt, t_m,
+ &pnode->pnode_slavevfs, mnt_pnode_mntlist) {
+
+ spin_unlock(&vfspnode_lock);
+ va_start(args, vfs_func);
+ if ((ret = vfs_func(slave_mnt, PNODE_SLAVE_VFS,
+ my_data, args)))
+ goto error;
+ va_end(args);
+ spin_lock(&vfspnode_lock);
+ }
+ spin_unlock(&vfspnode_lock);
+
+ if (pnode_post_func) {
+ va_start(args, vfs_func);
+ if((ret = pnode_post_func(pnode,
+ my_data, args)))
+ goto error;
+ va_end(args);
+ }
+
+ p_array[level] = my_data;
+ }
+out:
+ if (out_data)
+ *out_data = p_array[0];
+ return ret;
+error:
+ va_end(args);
+ pnode_end(&context);
+ goto out;
+}
Index: 2.6.12.work2/fs/dcache.c
===================================================================
--- 2.6.12.work2.orig/fs/dcache.c
+++ 2.6.12.work2/fs/dcache.c
@@ -27,6 +27,7 @@
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/file.h>
+#include <linux/pnode.h>
#include <asm/uaccess.h>
#include <linux/security.h>
#include <linux/seqlock.h>
@@ -1737,6 +1738,7 @@ void __init vfs_caches_init(unsigned lon
inode_init(mempages);
files_init(mempages);
mnt_init(mempages);
+ pnode_init(mempages);
bdev_cache_init();
chrdev_init();
}
Index: 2.6.12.work2/include/linux/fs.h
===================================================================
--- 2.6.12.work2.orig/include/linux/fs.h
+++ 2.6.12.work2/include/linux/fs.h
@@ -102,6 +102,9 @@ extern int dir_notify_enable;
#define MS_MOVE 8192
#define MS_REC 16384
#define MS_VERBOSE 32768
+#define MS_PRIVATE (1<<18) /* recursively change to private */
+#define MS_SLAVE (1<<19) /* recursively change to slave */
+#define MS_SHARED (1<<20) /* recursively change to shared */
#define MS_POSIXACL (1<<16) /* VFS does not apply the umask */
#define MS_ACTIVE (1<<30)
#define MS_NOUSER (1<<31)
@@ -232,6 +235,7 @@ extern void update_atime (struct inode *
extern void __init inode_init(unsigned long);
extern void __init inode_init_early(void);
extern void __init mnt_init(unsigned long);
+extern void __init pnode_init(unsigned long);
extern void __init files_init(unsigned long);
struct buffer_head;
@@ -1211,6 +1215,7 @@ extern struct vfsmount *kern_mount(struc
extern int may_umount_tree(struct vfsmount *);
extern int may_umount(struct vfsmount *);
extern long do_mount(char *, char *, char *, unsigned long, void *);
+extern struct vfsmount *do_make_mounted(struct vfsmount *, struct dentry *);
extern int vfs_statfs(struct super_block *, struct kstatfs *);
Index: 2.6.12.work2/include/linux/pnode.h
===================================================================
--- /dev/null
+++ 2.6.12.work2/include/linux/pnode.h
@@ -0,0 +1,90 @@
+/*
+ * linux/fs/pnode.c
+ *
+ * (C) Copyright IBM Corporation 2005.
+ * Released under GPL v2.
+ *
+ */
+#ifndef _LINUX_PNODE_H
+#define _LINUX_PNODE_H
+
+#include <linux/list.h>
+#include <linux/mount.h>
+#include <linux/spinlock.h>
+#include <asm/atomic.h>
+
+struct vfspnode {
+ struct list_head pnode_vfs; /* list of vfsmounts anchored here */
+ struct list_head pnode_slavevfs; /* list of slave vfsmounts */
+ struct list_head pnode_slavepnode;/* list of slave pnode */
+ struct list_head pnode_peer_slave;/* going through master's slave pnode
+ list*/
+ struct vfspnode *pnode_master; /* master pnode */
+ int pnode_flags;
+ atomic_t pnode_count;
+};
+#define PNODE_MAX_SLAVE_LEVEL 32 /* MAXIMUM DEPTH OF THE PNODE TREE */
+#define PNODE_DELETE 0x01
+#define PNODE_SLAVE 0x02
+
+#define IS_PNODE_DELETE(pn) ((pn->pnode_flags&PNODE_DELETE)==PNODE_DELETE)
+#define IS_PNODE_SLAVE(pn) ((pn->pnode_flags&PNODE_SLAVE)==PNODE_SLAVE)
+#define SET_PNODE_DELETE(pn) pn->pnode_flags |= PNODE_DELETE
+#define SET_PNODE_SLAVE(pn) pn->pnode_flags |= PNODE_SLAVE
+
+extern spinlock_t vfspnode_lock;
+extern void __put_pnode(struct vfspnode *);
+
+static inline struct vfspnode *
+get_pnode(struct vfspnode *pnode)
+{
+ if (!pnode)
+ return NULL;
+ atomic_inc(&pnode->pnode_count);
+ return pnode;
+}
+
+static inline void
+put_pnode(struct vfspnode *pnode)
+{
+ if (!pnode)
+ return;
+ if (atomic_dec_and_lock(&pnode->pnode_count, &vfspnode_lock)) {
+ __put_pnode(pnode);
+ spin_unlock(&vfspnode_lock);
+ }
+}
+
+/*
+ * must be called holding the vfspnode_lock
+ */
+static inline void
+put_pnode_locked(struct vfspnode *pnode)
+{
+ if (!pnode)
+ return;
+ if (atomic_dec_and_test(&pnode->pnode_count)) {
+ __put_pnode(pnode);
+ }
+}
+
+void __init pnode_init(unsigned long );
+struct vfspnode * pnode_alloc(void);
+void pnode_add_slave_mnt(struct vfspnode *, struct vfsmount *);
+void pnode_add_member_mnt(struct vfspnode *, struct vfsmount *);
+void pnode_del_slave_mnt(struct vfsmount *);
+void pnode_del_member_mnt(struct vfsmount *);
+void pnode_disassociate_mnt(struct vfsmount *);
+void pnode_add_slave_pnode(struct vfspnode *, struct vfspnode *);
+struct vfsmount * pnode_make_mounted(struct vfspnode *, struct vfsmount *,
+ struct dentry *);
+void pnode_member_to_slave(struct vfsmount *);
+int pnode_merge_pnode(struct vfspnode *, struct vfspnode *);
+struct vfsmount * pnode_make_mounted(struct vfspnode *, struct vfsmount *,
+ struct dentry *);
+int pnode_make_unmounted(struct vfspnode *);
+int pnode_prepare_mount(struct vfspnode *, struct vfspnode *, struct dentry *,
+ struct vfsmount *, struct vfsmount *);
+int pnode_commit_mount(struct vfspnode *, int);
+int pnode_abort_mount(struct vfspnode *, struct vfsmount *);
+#endif /* _LINUX_PNODE_H */
Index: 2.6.12.work2/include/linux/mount.h
===================================================================
--- 2.6.12.work2.orig/include/linux/mount.h
+++ 2.6.12.work2/include/linux/mount.h
@@ -16,9 +16,21 @@
#include <linux/spinlock.h>
#include <asm/atomic.h>
-#define MNT_NOSUID 1
-#define MNT_NODEV 2
-#define MNT_NOEXEC 4
+#define MNT_NOSUID 0x01
+#define MNT_NODEV 0x02
+#define MNT_NOEXEC 0x04
+#define MNT_PRIVATE 0x10 /* if the vfsmount is private, by default it is private*/
+#define MNT_SLAVE 0x20 /* if the vfsmount is a slave mount of its pnode */
+#define MNT_SHARED 0x40 /* if the vfsmount is a shared mount of its pnode */
+#define MNT_PNODE_MASK 0xf0 /* propagation flag mask */
+
+#define IS_MNT_SHARED(mnt) (mnt->mnt_flags & MNT_SHARED)
+#define IS_MNT_SLAVE(mnt) (mnt->mnt_flags & MNT_SLAVE)
+#define IS_MNT_PRIVATE(mnt) (mnt->mnt_flags & MNT_PRIVATE)
+
+#define CLEAR_MNT_SHARED(mnt) (mnt->mnt_flags &= ~(MNT_PNODE_MASK & MNT_SHARED))
+#define CLEAR_MNT_PRIVATE(mnt) (mnt->mnt_flags &= ~(MNT_PNODE_MASK & MNT_PRIVATE))
+#define CLEAR_MNT_SLAVE(mnt) (mnt->mnt_flags &= ~(MNT_PNODE_MASK & MNT_SLAVE))
struct vfsmount
{
@@ -29,6 +41,10 @@ struct vfsmount
struct super_block *mnt_sb; /* pointer to superblock */
struct list_head mnt_mounts; /* list of children, anchored here */
struct list_head mnt_child; /* and going through their mnt_child */
+ struct list_head mnt_pnode_mntlist;/* and going through their
+ pnode's vfsmount */
+ struct vfspnode *mnt_pnode; /* and going through their
+ pnode's vfsmount */
atomic_t mnt_count;
int mnt_flags;
int mnt_expiry_mark; /* true if marked for expiry */
@@ -38,6 +54,28 @@ struct vfsmount
struct namespace *mnt_namespace; /* containing namespace */
};
+static inline void set_mnt_shared(struct vfsmount *mnt)
+{
+ mnt->mnt_flags |= MNT_PNODE_MASK & MNT_SHARED;
+ CLEAR_MNT_PRIVATE(mnt);
+ CLEAR_MNT_SLAVE(mnt);
+}
+
+static inline void set_mnt_private(struct vfsmount *mnt)
+{
+ mnt->mnt_flags |= MNT_PNODE_MASK & MNT_PRIVATE;
+ CLEAR_MNT_SLAVE(mnt);
+ CLEAR_MNT_SHARED(mnt);
+ mnt->mnt_pnode = NULL;
+}
+
+static inline void set_mnt_slave(struct vfsmount *mnt)
+{
+ mnt->mnt_flags |= MNT_PNODE_MASK & MNT_SLAVE;
+ CLEAR_MNT_PRIVATE(mnt);
+ CLEAR_MNT_SHARED(mnt);
+}
+
static inline struct vfsmount *mntget(struct vfsmount *mnt)
{
if (mnt)
Index: 2.6.12.work2/fs/Makefile
===================================================================
--- 2.6.12.work2.orig/fs/Makefile
+++ 2.6.12.work2/fs/Makefile
@@ -8,7 +8,7 @@
obj-y := open.o read_write.o file_table.o buffer.o bio.o super.o \
block_dev.o char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \
ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \
- attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \
+ attr.o bad_inode.o file.o filesystems.o namespace.o pnode.o aio.o \
seq_file.o xattr.o libfs.o fs-writeback.o mpage.o direct-io.o \
obj-$(CONFIG_EPOLL) += eventpoll.o
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [PATCH 1/7] shared subtree
2005-07-25 22:44 ` Ram Pai
@ 2005-07-27 19:54 ` Miklos Szeredi
2005-07-27 21:39 ` Ram Pai
0 siblings, 1 reply; 20+ messages in thread
From: Miklos Szeredi @ 2005-07-27 19:54 UTC (permalink / raw)
To: linuxram; +Cc: akpm, viro, mathurav, mike, janak, linux-fsdevel, linux-kernel
> +static int do_make_shared(struct vfsmount *mnt)
> +{
> + int err=0;
> + struct vfspnode *old_pnode = NULL;
> + /*
> + * if the mount is already a slave mount,
> + * allocate a new pnode and make it
> + * a slave pnode of the original pnode.
> + */
> + if (IS_MNT_SLAVE(mnt)) {
> + old_pnode = mnt->mnt_pnode;
> + pnode_del_slave_mnt(mnt);
> + }
> + if(!IS_MNT_SHARED(mnt)) {
> + mnt->mnt_pnode = pnode_alloc();
> + if(!mnt->mnt_pnode) {
> + pnode_add_slave_mnt(old_pnode, mnt);
> + err = -ENOMEM;
> + goto out;
> + }
> + pnode_add_member_mnt(mnt->mnt_pnode, mnt);
> + }
> + if(old_pnode)
> + pnode_add_slave_pnode(old_pnode, mnt->mnt_pnode);
> + set_mnt_shared(mnt);
> +out:
> + return err;
> +}
This is an example, where having struct pnode just complicates things.
If there was no struct pnode, this function would be just one line:
setting the shared flag.
> +static kmem_cache_t * pnode_cachep;
> +
> +/* spinlock for pnode related operations */
> + __cacheline_aligned_in_smp DEFINE_SPINLOCK(vfspnode_lock);
> +
> +enum pnode_vfs_type {
> + PNODE_MEMBER_VFS = 0x01,
> + PNODE_SLAVE_VFS = 0x02
> +};
> +
> +void __init pnode_init(unsigned long mempages)
> +{
> + pnode_cachep = kmem_cache_create("pnode_cache",
> + sizeof(struct vfspnode), 0,
> + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
> +}
> +
> +struct vfspnode * pnode_alloc(void)
> +{
> + struct vfspnode *pnode = kmem_cache_alloc(pnode_cachep, GFP_KERNEL);
> + INIT_LIST_HEAD(&pnode->pnode_vfs);
> + INIT_LIST_HEAD(&pnode->pnode_slavevfs);
> + INIT_LIST_HEAD(&pnode->pnode_slavepnode);
> + INIT_LIST_HEAD(&pnode->pnode_peer_slave);
> + pnode->pnode_master = NULL;
> + pnode->pnode_flags = 0;
> + atomic_set(&pnode->pnode_count,0);
> + return pnode;
> +}
> +
> +void inline pnode_free(struct vfspnode *pnode)
> +{
> + kmem_cache_free(pnode_cachep, pnode);
> +}
> +
> +/*
> + * __put_pnode() should be called with vfspnode_lock held
> + */
> +void __put_pnode(struct vfspnode *pnode)
> +{
> + struct vfspnode *tmp_pnode;
> + do {
> + tmp_pnode = pnode->pnode_master;
> + list_del_init(&pnode->pnode_peer_slave);
> + BUG_ON(!list_empty(&pnode->pnode_vfs));
> + BUG_ON(!list_empty(&pnode->pnode_slavevfs));
> + BUG_ON(!list_empty(&pnode->pnode_slavepnode));
> + pnode_free(pnode);
> + pnode = tmp_pnode;
> + if (!pnode || !atomic_dec_and_test(&pnode->pnode_count))
> + break;
> + } while(pnode);
> +}
> +
All these are really unnecessary IMO.
> +/*
> + * merge 'pnode' into 'peer_pnode' and get rid of pnode
> + * @pnode: pnode the contents of which have to be merged
> + * @peer_pnode: pnode into which the contents are merged
> + */
> +int pnode_merge_pnode(struct vfspnode *pnode, struct vfspnode *peer_pnode)
> +{
> + struct vfspnode *slave_pnode, *pnext;
> + struct vfsmount *mnt, *slave_mnt, *next;
> +
> + list_for_each_entry_safe(slave_pnode, pnext,
> + &pnode->pnode_slavepnode, pnode_peer_slave) {
> + slave_pnode->pnode_master = peer_pnode;
> + list_move(&slave_pnode->pnode_peer_slave,
> + &peer_pnode->pnode_slavepnode);
> + put_pnode_locked(pnode);
> + get_pnode(peer_pnode);
> + }
> +
> + list_for_each_entry_safe(slave_mnt, next,
> + &pnode->pnode_slavevfs, mnt_pnode_mntlist) {
> + slave_mnt->mnt_pnode = peer_pnode;
> + list_move(&slave_mnt->mnt_pnode_mntlist,
> + &peer_pnode->pnode_slavevfs);
> + put_pnode_locked(pnode);
> + get_pnode(peer_pnode);
> + }
> +
> + list_for_each_entry_safe(mnt, next,
> + &pnode->pnode_vfs, mnt_pnode_mntlist) {
> + mnt->mnt_pnode = peer_pnode;
> + list_move(&mnt->mnt_pnode_mntlist,
> + &peer_pnode->pnode_vfs);
> + put_pnode_locked(pnode);
> + get_pnode(peer_pnode);
> + }
> + return 0;
> +}
Much overcomplication. It would just be a list_splice(), if there was
no struct pnode.
> +static void empty_pnode(struct vfspnode *pnode) { struct vfsmount *slave_mnt,
> + *next; struct vfspnode *master_pnode, *slave_pnode, *pnext;
> +
> + if ((master_pnode = pnode->pnode_master)) {
> + pnode->pnode_master = NULL;
> + list_del_init(&pnode->pnode_peer_slave);
> + pnode_merge_pnode(pnode, master_pnode);
> + put_pnode_locked(master_pnode);
> + } else {
> + list_for_each_entry_safe(slave_mnt, next,
> + &pnode->pnode_slavevfs, mnt_pnode_mntlist) {
> + list_del_init(&slave_mnt->mnt_pnode_mntlist);
> + set_mnt_private(slave_mnt);
> + put_pnode_locked(pnode);
> + }
> + list_for_each_entry_safe(slave_pnode, pnext,
> + &pnode->pnode_slavepnode, pnode_peer_slave) {
> + slave_pnode->pnode_master = NULL;
> + list_del_init(&slave_pnode->pnode_peer_slave);
> + put_pnode_locked(pnode);
> + }
> + }
> +}
> +
Unnecessary.
> +static int pnode_next(struct pcontext *context)
> +{
> + struct vfspnode *pnode = context->pnode;
> + struct vfspnode *master_pnode=context->master_pnode;
> + struct list_head *next;
> +
> + if (!pnode) {
> + BUG_ON(!context->start);
> + get_pnode(context->start);
> + context->pnode = context->start;
> + context->master_pnode = NULL;
> + context->level = 0;
> + return 1;
> + }
> +
> + spin_lock(&vfspnode_lock);
> + next = pnode->pnode_slavepnode.next;
> + if (next == &pnode->pnode_slavepnode) {
> + while (1) {
> + int flag;
> +
> + if (pnode == context->start) {
> + put_pnode_locked(pnode);
> + spin_unlock(&vfspnode_lock);
> + BUG_ON(context->level != 0);
> + return 0;
> + }
> +
> + next = pnode->pnode_peer_slave.next;
> + flag = (next != &pnode->pnode_master->pnode_slavepnode);
> + put_pnode_locked(pnode);
> +
> + if (flag)
> + break;
> +
> + pnode = master_pnode;
> + master_pnode = pnode->pnode_master;
> + context->level--;
> + }
> + } else {
> + master_pnode = pnode;
> + context->level++;
> + }
> +
> + pnode = list_entry(next, struct vfspnode, pnode_peer_slave);
> + get_pnode(pnode);
> +
> + context->pnode = pnode;
> + context->master_pnode = master_pnode;
> + spin_unlock(&vfspnode_lock);
> + return 1;
> +}
> +
> +/*
> + * skip the rest of the tree, cleaning up
> + * reference to pnodes held in pnode_next().
> + */
> +static void pnode_end(struct pcontext *context)
> +{
> + struct vfspnode *p = context->pnode;
> + struct vfspnode *start = context->start;
> +
> + do {
> + put_pnode(p);
> + } while (p != start && (p = p->pnode_master));
> + return;
> +}
> +
> +/*
> + * traverse the pnode tree and at each pnode encountered, execute the
> + * pnode_fnc(). For each vfsmount encountered call the vfs_fnc().
> + *
> + * @pnode: pnode tree to be traversed
> + * @in_data: input data
> + * @out_data: output data
> + * @pnode_func: function to be called when a new pnode is encountered.
> + * @vfs_func: function to be called on each slave and member vfs belonging
> + * to the pnode.
> + */
> +static int pnode_traverse(struct vfspnode *pnode,
> + void *in_data,
> + void **out_data,
> + int (*pnode_pre_func)(struct vfspnode *,
> + void *, void **, va_list),
> + int (*pnode_post_func)(struct vfspnode *,
> + void *, va_list),
> + int (*vfs_func)(struct vfsmount *,
> + enum pnode_vfs_type, void *, va_list),
> + ...)
> +{
> + va_list args;
> + int ret = 0, level;
> + void *my_data, *data_from_master;
> + struct vfspnode *master_pnode;
> + struct vfsmount *slave_mnt, *member_mnt, *t_m;
> + struct pcontext context;
> + static void *p_array[PNODE_MAX_SLAVE_LEVEL];
> +
> + context.start = pnode;
> + context.pnode = NULL;
> + /*
> + * determine whether to process vfs first or the
> + * slave pnode first
> + */
> + while (pnode_next(&context)) {
> + level = context.level;
> +
> + if (level >= PNODE_MAX_SLAVE_LEVEL)
> + goto error;
> +
> + pnode = context.pnode;
> + master_pnode = context.master_pnode;
> +
> + if (master_pnode) {
> + data_from_master = p_array[level-1];
> + my_data = NULL;
> + } else {
> + data_from_master = NULL;
> + my_data = in_data;
> + }
> +
> + if (pnode_pre_func) {
> + va_start(args, vfs_func);
> + if((ret = pnode_pre_func(pnode,
> + data_from_master, &my_data, args)))
> + goto error;
> + va_end(args);
> + }
> +
> + // traverse member vfsmounts
> + spin_lock(&vfspnode_lock);
> + list_for_each_entry_safe(member_mnt,
> + t_m, &pnode->pnode_vfs, mnt_pnode_mntlist) {
> +
> + spin_unlock(&vfspnode_lock);
> + va_start(args, vfs_func);
> + if ((ret = vfs_func(member_mnt,
> + PNODE_MEMBER_VFS, my_data, args)))
> + goto error;
> + va_end(args);
> + spin_lock(&vfspnode_lock);
> + }
> + list_for_each_entry_safe(slave_mnt, t_m,
> + &pnode->pnode_slavevfs, mnt_pnode_mntlist) {
> +
> + spin_unlock(&vfspnode_lock);
> + va_start(args, vfs_func);
> + if ((ret = vfs_func(slave_mnt, PNODE_SLAVE_VFS,
> + my_data, args)))
> + goto error;
> + va_end(args);
> + spin_lock(&vfspnode_lock);
> + }
> + spin_unlock(&vfspnode_lock);
> +
> + if (pnode_post_func) {
> + va_start(args, vfs_func);
> + if((ret = pnode_post_func(pnode,
> + my_data, args)))
> + goto error;
> + va_end(args);
> + }
> +
> + p_array[level] = my_data;
> + }
> +out:
> + if (out_data)
> + *out_data = p_array[0];
> + return ret;
> +error:
> + va_end(args);
> + pnode_end(&context);
> + goto out;
> +}
>
And this is the worst part. As I said earlier, void pointers and
variable argument functions have no place in this kind of code.
I think you could get rid of all these if you'd implement a simple
iterator function which returns the traversed vfsmounts. That's
another big argument for getting rid of struct pnode: you could do
iteration simply by holding onto a vfsmount pointer, instead of having
to do a two level iteration, once over pnodes, then over vfsmounts.
> +extern spinlock_t vfspnode_lock;
> +extern void __put_pnode(struct vfspnode *);
> +
> +static inline struct vfspnode *
> +get_pnode(struct vfspnode *pnode)
> +{
> + if (!pnode)
> + return NULL;
> + atomic_inc(&pnode->pnode_count);
> + return pnode;
> +}
> +
> +static inline void
> +put_pnode(struct vfspnode *pnode)
> +{
> + if (!pnode)
> + return;
> + if (atomic_dec_and_lock(&pnode->pnode_count, &vfspnode_lock)) {
> + __put_pnode(pnode);
> + spin_unlock(&vfspnode_lock);
> + }
> +}
Unnecessary.
> +#define MNT_PRIVATE 0x10 /* if the vfsmount is private, by default it is private*/
If by default it's private, why is this flag needed?
Miklos
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [PATCH 1/7] shared subtree
2005-07-27 19:54 ` [PATCH 1/7] shared subtree Miklos Szeredi
@ 2005-07-27 21:39 ` Ram Pai
2005-07-28 9:57 ` Miklos Szeredi
0 siblings, 1 reply; 20+ messages in thread
From: Ram Pai @ 2005-07-27 21:39 UTC (permalink / raw)
To: Miklos Szeredi
Cc: Andrew Morton, viro, Avantika Mathur, mike, janak, linux-fsdevel,
linux-kernel
On Wed, 2005-07-27 at 12:54, Miklos Szeredi wrote:
> > +static int do_make_shared(struct vfsmount *mnt)
> > +{
> > + int err=0;
> > + struct vfspnode *old_pnode = NULL;
> > + /*
> > + * if the mount is already a slave mount,
> > + * allocate a new pnode and make it
> > + * a slave pnode of the original pnode.
> > + */
> > + if (IS_MNT_SLAVE(mnt)) {
> > + old_pnode = mnt->mnt_pnode;
> > + pnode_del_slave_mnt(mnt);
> > + }
> > + if(!IS_MNT_SHARED(mnt)) {
> > + mnt->mnt_pnode = pnode_alloc();
> > + if(!mnt->mnt_pnode) {
> > + pnode_add_slave_mnt(old_pnode, mnt);
> > + err = -ENOMEM;
> > + goto out;
> > + }
> > + pnode_add_member_mnt(mnt->mnt_pnode, mnt);
> > + }
> > + if(old_pnode)
> > + pnode_add_slave_pnode(old_pnode, mnt->mnt_pnode);
> > + set_mnt_shared(mnt);
> > +out:
> > + return err;
> > +}
>
> This is an example, where having struct pnode just complicates things.
> If there was no struct pnode, this function would be just one line:
> setting the shared flag.
So your comment is mostly about getting rid of pnode and distributing
the pnode functionality in the vfsmount structure.
I know you are thinking of just having the necessary propagation list in
the vfsmount structure itself. True, with that implementation the
complication is reduced in this part of the code, but it really complicates
the propagation traversal routines.
In order to find out the slaves of a given mount:
with your proposal: I have to walk through all the peer mounts of this
mount and check for any slaves there.
in my implementation: I have to just find which pnode it belongs to, and
all the slaves are easily available there.
In order to find out all the shared mounts that are slave of this
mount:
with your proposal: Not sure how to do. Maybe you have to have another
field in each of the vfsmounts that will point to
the shared mounts that are slave of this mount.??
in my implementation: I have to just find the pnode it belongs to,
and all the slave pnodes are easily available there.
There are complexity tradeoffs in both implementations. But I
personally felt having a pnode structure keeps the pnode operations
separated out cleanly. It helps to easily visualize the propagation
tree. And also one more thing influenced my thought process. The
statement in Al Viro's RFC:
-----------------------------------------------------------------------
How do we set them up?
* we can mark a subtree sharable. Every vfsmount in the subtree
that is not already in some p-node gets a single-element p-node of its
own.
* we can mark a subtree slave. That removes all vfsmounts in
the subtree from their p-nodes and makes them owned by said p-nodes.
p-nodes that became empty will disappear and everything they used to
own will be repossessed by their owners (if any).
* we can mark a subtree private. Same as above, but followed
by taking all vfsmounts in our subtree and making them *not* owned
by anybody.
--------------------------------------------------------------------
The above statements imply some implementation detail. Not sure if you
will buy this point :)
>
> > +static kmem_cache_t * pnode_cachep;
> > +
> > +/* spinlock for pnode related operations */
> > + __cacheline_aligned_in_smp DEFINE_SPINLOCK(vfspnode_lock);
> > +
> > +enum pnode_vfs_type {
> > + PNODE_MEMBER_VFS = 0x01,
> > + PNODE_SLAVE_VFS = 0x02
> > +};
> > +
> > +void __init pnode_init(unsigned long mempages)
> > +{
> > + pnode_cachep = kmem_cache_create("pnode_cache",
> > + sizeof(struct vfspnode), 0,
> > + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
> > +}
> > +
> > +struct vfspnode * pnode_alloc(void)
> > +{
> > + struct vfspnode *pnode = kmem_cache_alloc(pnode_cachep, GFP_KERNEL);
> > + INIT_LIST_HEAD(&pnode->pnode_vfs);
> > + INIT_LIST_HEAD(&pnode->pnode_slavevfs);
> > + INIT_LIST_HEAD(&pnode->pnode_slavepnode);
> > + INIT_LIST_HEAD(&pnode->pnode_peer_slave);
> > + pnode->pnode_master = NULL;
> > + pnode->pnode_flags = 0;
> > + atomic_set(&pnode->pnode_count,0);
> > + return pnode;
> > +}
> > +
> > +void inline pnode_free(struct vfspnode *pnode)
> > +{
> > + kmem_cache_free(pnode_cachep, pnode);
> > +}
> > +
> > +/*
> > + * __put_pnode() should be called with vfspnode_lock held
> > + */
> > +void __put_pnode(struct vfspnode *pnode)
> > +{
> > + struct vfspnode *tmp_pnode;
> > + do {
> > + tmp_pnode = pnode->pnode_master;
> > + list_del_init(&pnode->pnode_peer_slave);
> > + BUG_ON(!list_empty(&pnode->pnode_vfs));
> > + BUG_ON(!list_empty(&pnode->pnode_slavevfs));
> > + BUG_ON(!list_empty(&pnode->pnode_slavepnode));
> > + pnode_free(pnode);
> > + pnode = tmp_pnode;
> > + if (!pnode || !atomic_dec_and_test(&pnode->pnode_count))
> > + break;
> > + } while(pnode);
> > +}
> > +
>
> All these are really unnecessary IMO.
>
> > +/*
> > + * merge 'pnode' into 'peer_pnode' and get rid of pnode
> > + * @pnode: pnode the contents of which have to be merged
> > + * @peer_pnode: pnode into which the contents are merged
> > + */
> > +int pnode_merge_pnode(struct vfspnode *pnode, struct vfspnode *peer_pnode)
> > +{
> > + struct vfspnode *slave_pnode, *pnext;
> > + struct vfsmount *mnt, *slave_mnt, *next;
> > +
> > + list_for_each_entry_safe(slave_pnode, pnext,
> > + &pnode->pnode_slavepnode, pnode_peer_slave) {
> > + slave_pnode->pnode_master = peer_pnode;
> > + list_move(&slave_pnode->pnode_peer_slave,
> > + &peer_pnode->pnode_slavepnode);
> > + put_pnode_locked(pnode);
> > + get_pnode(peer_pnode);
> > + }
> > +
> > + list_for_each_entry_safe(slave_mnt, next,
> > + &pnode->pnode_slavevfs, mnt_pnode_mntlist) {
> > + slave_mnt->mnt_pnode = peer_pnode;
> > + list_move(&slave_mnt->mnt_pnode_mntlist,
> > + &peer_pnode->pnode_slavevfs);
> > + put_pnode_locked(pnode);
> > + get_pnode(peer_pnode);
> > + }
> > +
> > + list_for_each_entry_safe(mnt, next,
> > + &pnode->pnode_vfs, mnt_pnode_mntlist) {
> > + mnt->mnt_pnode = peer_pnode;
> > + list_move(&mnt->mnt_pnode_mntlist,
> > + &peer_pnode->pnode_vfs);
> > + put_pnode_locked(pnode);
> > + get_pnode(peer_pnode);
> > + }
> > + return 0;
> > +}
>
> Much overcomplication. It would just be a list_splice(), if there was
> no struct pnode.
yes. with your proposal this will be list_splice().
>
>
> > +static void empty_pnode(struct vfspnode *pnode) { struct vfsmount *slave_mnt,
> > + *next; struct vfspnode *master_pnode, *slave_pnode, *pnext;
> > +
> > + if ((master_pnode = pnode->pnode_master)) {
> > + pnode->pnode_master = NULL;
> > + list_del_init(&pnode->pnode_peer_slave);
> > + pnode_merge_pnode(pnode, master_pnode);
> > + put_pnode_locked(master_pnode);
> > + } else {
> > + list_for_each_entry_safe(slave_mnt, next,
> > + &pnode->pnode_slavevfs, mnt_pnode_mntlist) {
> > + list_del_init(&slave_mnt->mnt_pnode_mntlist);
> > + set_mnt_private(slave_mnt);
> > + put_pnode_locked(pnode);
> > + }
> > + list_for_each_entry_safe(slave_pnode, pnext,
> > + &pnode->pnode_slavepnode, pnode_peer_slave) {
> > + slave_pnode->pnode_master = NULL;
> > + list_del_init(&slave_pnode->pnode_peer_slave);
> > + put_pnode_locked(pnode);
> > + }
> > + }
> > +}
> > +
>
> Unnecessary.
Maybe this function can be merged with the functionality in
pnode_merge(). I will look into that.
>
> > +static int pnode_next(struct pcontext *context)
> > +{
> > + struct vfspnode *pnode = context->pnode;
> > + struct vfspnode *master_pnode=context->master_pnode;
> > + struct list_head *next;
> > +
> > + if (!pnode) {
> > + BUG_ON(!context->start);
> > + get_pnode(context->start);
> > + context->pnode = context->start;
> > + context->master_pnode = NULL;
> > + context->level = 0;
> > + return 1;
> > + }
> > +
> > + spin_lock(&vfspnode_lock);
> > + next = pnode->pnode_slavepnode.next;
> > + if (next == &pnode->pnode_slavepnode) {
> > + while (1) {
> > + int flag;
> > +
> > + if (pnode == context->start) {
> > + put_pnode_locked(pnode);
> > + spin_unlock(&vfspnode_lock);
> > + BUG_ON(context->level != 0);
> > + return 0;
> > + }
> > +
> > + next = pnode->pnode_peer_slave.next;
> > + flag = (next != &pnode->pnode_master->pnode_slavepnode);
> > + put_pnode_locked(pnode);
> > +
> > + if (flag)
> > + break;
> > +
> > + pnode = master_pnode;
> > + master_pnode = pnode->pnode_master;
> > + context->level--;
> > + }
> > + } else {
> > + master_pnode = pnode;
> > + context->level++;
> > + }
> > +
> > + pnode = list_entry(next, struct vfspnode, pnode_peer_slave);
> > + get_pnode(pnode);
> > +
> > + context->pnode = pnode;
> > + context->master_pnode = master_pnode;
> > + spin_unlock(&vfspnode_lock);
> > + return 1;
> > +}
> > +
> > +/*
> > + * skip the rest of the tree, cleaning up
> > + * reference to pnodes held in pnode_next().
> > + */
> > +static void pnode_end(struct pcontext *context)
> > +{
> > + struct vfspnode *p = context->pnode;
> > + struct vfspnode *start = context->start;
> > +
> > + do {
> > + put_pnode(p);
> > + } while (p != start && (p = p->pnode_master));
> > + return;
> > +}
> > +
> > +/*
> > + * traverse the pnode tree and at each pnode encountered, execute the
> > + * pnode_fnc(). For each vfsmount encountered call the vfs_fnc().
> > + *
> > + * @pnode: pnode tree to be traversed
> > + * @in_data: input data
> > + * @out_data: output data
> > + * @pnode_func: function to be called when a new pnode is encountered.
> > + * @vfs_func: function to be called on each slave and member vfs belonging
> > + * to the pnode.
> > + */
> > +static int pnode_traverse(struct vfspnode *pnode,
> > + void *in_data,
> > + void **out_data,
> > + int (*pnode_pre_func)(struct vfspnode *,
> > + void *, void **, va_list),
> > + int (*pnode_post_func)(struct vfspnode *,
> > + void *, va_list),
> > + int (*vfs_func)(struct vfsmount *,
> > + enum pnode_vfs_type, void *, va_list),
> > + ...)
> > +{
> > + va_list args;
> > + int ret = 0, level;
> > + void *my_data, *data_from_master;
> > + struct vfspnode *master_pnode;
> > + struct vfsmount *slave_mnt, *member_mnt, *t_m;
> > + struct pcontext context;
> > + static void *p_array[PNODE_MAX_SLAVE_LEVEL];
> > +
> > + context.start = pnode;
> > + context.pnode = NULL;
> > + /*
> > + * determine whether to process vfs first or the
> > + * slave pnode first
> > + */
> > + while (pnode_next(&context)) {
> > + level = context.level;
> > +
> > + if (level >= PNODE_MAX_SLAVE_LEVEL)
> > + goto error;
> > +
> > + pnode = context.pnode;
> > + master_pnode = context.master_pnode;
> > +
> > + if (master_pnode) {
> > + data_from_master = p_array[level-1];
> > + my_data = NULL;
> > + } else {
> > + data_from_master = NULL;
> > + my_data = in_data;
> > + }
> > +
> > + if (pnode_pre_func) {
> > + va_start(args, vfs_func);
> > + if((ret = pnode_pre_func(pnode,
> > + data_from_master, &my_data, args)))
> > + goto error;
> > + va_end(args);
> > + }
> > +
> > + // traverse member vfsmounts
> > + spin_lock(&vfspnode_lock);
> > + list_for_each_entry_safe(member_mnt,
> > + t_m, &pnode->pnode_vfs, mnt_pnode_mntlist) {
> > +
> > + spin_unlock(&vfspnode_lock);
> > + va_start(args, vfs_func);
> > + if ((ret = vfs_func(member_mnt,
> > + PNODE_MEMBER_VFS, my_data, args)))
> > + goto error;
> > + va_end(args);
> > + spin_lock(&vfspnode_lock);
> > + }
> > + list_for_each_entry_safe(slave_mnt, t_m,
> > + &pnode->pnode_slavevfs, mnt_pnode_mntlist) {
> > +
> > + spin_unlock(&vfspnode_lock);
> > + va_start(args, vfs_func);
> > + if ((ret = vfs_func(slave_mnt, PNODE_SLAVE_VFS,
> > + my_data, args)))
> > + goto error;
> > + va_end(args);
> > + spin_lock(&vfspnode_lock);
> > + }
> > + spin_unlock(&vfspnode_lock);
> > +
> > + if (pnode_post_func) {
> > + va_start(args, vfs_func);
> > + if((ret = pnode_post_func(pnode,
> > + my_data, args)))
> > + goto error;
> > + va_end(args);
> > + }
> > +
> > + p_array[level] = my_data;
> > + }
> > +out:
> > + if (out_data)
> > + *out_data = p_array[0];
> > + return ret;
> > +error:
> > + va_end(args);
> > + pnode_end(&context);
> > + goto out;
> > +}
> >
>
> And this is the worst part. As I said earlier, void pointers and
> variable argument functions have no place in this kind of code.
>
> I think you could get rid of all these if you'd implement a simple
> iterator function which returns the traversed vfsmounts. That's
> another big argument for getting rid of struct pnode: you could do
> iteration simply by holding onto a vfsmount pointer, instead of having
> to do a two level iteration, once over pnodes, then over vfsmounts.
I can try to implement your style, which is getting rid of pnode, but
seriously that will be a really messy implementation. Too many pointers
to traverse in the vfsmount, and no clear idea which path to take to
traverse the propagation tree.
And if you are ok with pnode, then I can work on simplifying the
pnode_traverse routine, which has void pointers and variable arguments.
But doing that will just produce lots of redundant code. pnode_traverse
is just an abstract routine that helps all other routines traverse the
pnode tree. It's kind of a helper function.
>
> > +extern spinlock_t vfspnode_lock;
> > +extern void __put_pnode(struct vfspnode *);
> > +
> > +static inline struct vfspnode *
> > +get_pnode(struct vfspnode *pnode)
> > +{
> > + if (!pnode)
> > + return NULL;
> > + atomic_inc(&pnode->pnode_count);
> > + return pnode;
> > +}
> > +
> > +static inline void
> > +put_pnode(struct vfspnode *pnode)
> > +{
> > + if (!pnode)
> > + return;
> > + if (atomic_dec_and_lock(&pnode->pnode_count, &vfspnode_lock)) {
> > + __put_pnode(pnode);
> > + spin_unlock(&vfspnode_lock);
> > + }
> > +}
>
> Unnecessary.
>
> > +#define MNT_PRIVATE 0x10 /* if the vfsmount is private, by default it is private*/
>
> If by default it's private, why is this flag needed?
yes it is not needed. Will get rid of this.
RP
>
> Miklos
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [PATCH 1/7] shared subtree
2005-07-27 21:39 ` Ram Pai
@ 2005-07-28 9:57 ` Miklos Szeredi
2005-07-29 19:54 ` Ram Pai
0 siblings, 1 reply; 20+ messages in thread
From: Miklos Szeredi @ 2005-07-28 9:57 UTC (permalink / raw)
To: linuxram; +Cc: akpm, viro, mathurav, mike, janak, linux-fsdevel, linux-kernel
> > This is an example, where having struct pnode just complicates things.
> > If there was no struct pnode, this function would be just one line:
> > setting the shared flag.
> So your comment is mostly about getting rid of pnode and distributing
> the pnode functionality in the vfsmount structure.
Yes, sorry if I didn't make it clear.
> I know you are thinking of just having the necessary propogation list in
> the vfsmount structure itself. Yes true with that implementation the
> complication is reduced in this part of the code, but really complicates
> the propogation traversal routines.
On the contrary, I think it will simplify the traversal routines.
Here's an iterator function I coded up. Not tested at all (may not
even compile):
struct vfsmount {
/* ... */
struct list_head mnt_share; /* circular list of shared mounts */
struct list_head mnt_slave_list; /* list of slave mounts */
struct list_head mnt_slave; /* slave list entry */
struct vfsmount *master; /* slave is on master->mnt_slave_list */
};
static inline struct vfsmount *next_shared(struct vfsmount *p)
{
return list_entry(p->mnt_share.next, struct vfsmount, mnt_share);
}
static inline struct vfsmount *first_slave(struct vfsmount *p)
{
return list_entry(p->mnt_slave_list.next, struct vfsmount, mnt_slave);
}
static inline struct vfsmount *next_slave(struct vfsmount *p)
{
return list_entry(p->mnt_slave.next, struct vfsmount, mnt_slave);
}
static struct vfsmount *propagation_next(struct vfsmount *p,
struct vfsmount *base)
{
/* first iterate over the slaves */
if (!list_empty(&p->mnt_slave_list))
return first_slave(p);
while (1) {
struct vfsmount *q;
/* more vfsmounts belong to the pnode? */
if (!list_empty(&p->mnt_share)) {
p = next_shared(p);
if (list_empty(&p->mnt_slave) && p != base)
return p;
}
if (p == base)
break;
BUG_ON(list_empty(&p->mnt_slave));
/* more slaves? */
q = next_slave(p);
if (p->master != q)
return q;
/* back at master */
p = q;
}
return NULL;
}
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [PATCH 1/7] shared subtree
2005-07-28 9:57 ` Miklos Szeredi
@ 2005-07-29 19:54 ` Ram Pai
2005-07-30 5:39 ` Miklos Szeredi
0 siblings, 1 reply; 20+ messages in thread
From: Ram Pai @ 2005-07-29 19:54 UTC (permalink / raw)
To: Miklos Szeredi
Cc: Andrew Morton, viro, Avantika Mathur, mike, janak, linux-fsdevel,
linux-kernel
On Thu, 2005-07-28 at 02:57, Miklos Szeredi wrote:
> > > This is an example, where having struct pnode just complicates things.
> > > If there was no struct pnode, this function would be just one line:
> > > setting the shared flag.
> > So your comment is mostly about getting rid of pnode and distributing
> > the pnode functionality in the vfsmount structure.
>
> Yes, sorry if I didn't make it clear.
>
> > I know you are thinking of just having the necessary propogation list in
> > the vfsmount structure itself. Yes true with that implementation the
> > complication is reduced in this part of the code, but really complicates
> > the propogation traversal routines.
>
> On the contrary, I think it will simplify the traversal routines.
>
> Here's an iterator function I coded up. Not tested at all (may not
> even compile):
Your suggested code has bugs. But I understand what you are aiming at.
Maybe you are right. I will try out a implementation using your idea.
Hmm.. lots of code change, and testing.
>
> struct vfsmount {
> /* ... */
>
> struct list_head mnt_share; /* circular list of shared mounts */
> struct list_head mnt_slave_list; /* list of slave mounts */
> struct list_head mnt_slave; /* slave list entry */
> struct vfsmount *master; /* slave is on master->mnt_slave_list */
> };
>
> static inline struct vfsmount *next_shared(struct vfsmount *p)
> {
> return list_entry(p->mnt_share.next, struct vfsmount, mnt_share);
> }
>
> static inline struct vfsmount *first_slave(struct vfsmount *p)
> {
> return list_entry(p->mnt_slave_list.next, struct vfsmount, mnt_slave);
> }
>
> static inline struct vfsmount *next_slave(struct vfsmount *p)
> {
> return list_entry(p->mnt_slave.next, struct vfsmount, mnt_slave);
> }
>
> static struct vfsmount *propagation_next(struct vfsmount *p,
> struct vfsmount *base)
> {
> /* first iterate over the slaves */
> if (!list_empty(&p->mnt_slave_list))
> return first_slave(p);
I think this code should be
if (!list_empty(&p->mnt_slave))
return next_slave(p);
Right? I think I get the idea.
RP
>
> while (1) {
> struct vfsmount *q;
>
> /* more vfsmounts belong to the pnode? */
> if (!list_empty(&p->mnt_share)) {
> p = next_shared(p);
> if (list_empty(&p->mnt_slave) && p != base)
> return p;
> }
> if (p == base)
> break;
>
> BUG_ON(list_empty(&p->mnt_slave));
>
> /* more slaves? */
> q = next_slave(p);
> if (p->master != q)
> return q;
>
> /* back at master */
> p = q;
> }
>
> return NULL;
> }
>
> -
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [PATCH 1/7] shared subtree
2005-07-29 19:54 ` Ram Pai
@ 2005-07-30 5:39 ` Miklos Szeredi
2005-07-31 0:45 ` Ram Pai
0 siblings, 1 reply; 20+ messages in thread
From: Miklos Szeredi @ 2005-07-30 5:39 UTC (permalink / raw)
To: linuxram; +Cc: akpm, viro, mathurav, mike, janak, linux-fsdevel, linux-kernel
> > static struct vfsmount *propagation_next(struct vfsmount *p,
> > struct vfsmount *base)
> > {
> > /* first iterate over the slaves */
> > if (!list_empty(&p->mnt_slave_list))
> > return first_slave(p);
>
> I think this code should be
> if (!list_empty(&p->mnt_slave))
> return next_slave(p);
>
> Right? I think I get the idea.
This is a depth-first search, so first_slave() is right.
Here's a less buggy (and even more simplified) version of the
function. Note: it must be called with 'origin' either on a slave
list or at the root pnode. That's because the function checks whether
all vfsmounts in a pnode have been traversed by looking at the
emptiness of mnt_slave. So if origin was in a slave pnode, but is not
the actual slave link, the algorithm will go off the starting pnode
and up to its master.
So here's a preparation function that finds the right place to start
the propagation.
static struct vfsmount *propagation_first(struct vfsmount *p)
{
struct vfsmount *q = p;
while (list_empty(&q->mnt_slave)) {
q = next_shared(q);
if (q == p)
break;
}
return q;
}
static struct vfsmount *propagation_next(struct vfsmount *p,
struct vfsmount *origin)
{
/* are there any slaves of this mount? */
if (!list_empty(&p->mnt_slave_list))
return first_slave(p);
while (1) {
/* if p->mnt_share is empty, this is a no-op */
p = next_shared(p);
/* finished traversing? */
if (p == origin)
break;
/* more vfsmounts belong to the pnode? */
if (list_empty(&p->mnt_slave))
return p;
/* more slaves? */
if (p->mnt_slave.next != &p->mnt_master->mnt_slave_list)
return next_slave(p);
/* back at master */
p = p->mnt_master;
}
return NULL;
}
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [PATCH 1/7] shared subtree
2005-07-30 5:39 ` Miklos Szeredi
@ 2005-07-31 0:45 ` Ram Pai
2005-07-31 7:52 ` Miklos Szeredi
0 siblings, 1 reply; 20+ messages in thread
From: Ram Pai @ 2005-07-31 0:45 UTC (permalink / raw)
To: Miklos Szeredi
Cc: Andrew Morton, viro, Avantika Mathur, mike, janak, linux-fsdevel,
linux-kernel
On Fri, 2005-07-29 at 22:39, Miklos Szeredi wrote:
> > > static struct vfsmount *propagation_next(struct vfsmount *p,
> > > struct vfsmount *base)
> > > {
> > > /* first iterate over the slaves */
> > > if (!list_empty(&p->mnt_slave_list))
> > > return first_slave(p);
> >
> > I think this code should be
> > if (!list_empty(&p->mnt_slave))
> > return next_slave(p);
> >
> > Right? I think I get the idea.
>
> This is a depth-first search, so first_slave() is right.
Ok. I have started implementing your idea. But the implementation is not
simple. It becomes a complex mess. At least in the case of the pnode
data-structure implementation, the propagation was all abstracted and
concentrated in the pnode data structure.
Here is a sample implementation of do_make_slave() with your idea.
static int do_make_slave(struct vfsmount *mnt)
{
int err=0;
struct vfsmount *peer_mnt;
spin_lock(&vfspnode_lock);
if (!IS_MNT_SHARED(mnt)) {
spin_unlock(&vfspnode_lock);
err = -EINVAL;
goto out;
}
peer_mnt = list_entry(mnt->mnt_share.next, struct vfsmount,
mnt_share);
if (peer_mnt == mnt)
peer_mnt = NULL;
list_del_init(&mnt->mnt_share);
if (peer_mnt) {
/* move the slave list to the peer_mnt */
list_splice(&mnt->mnt_slave, &peer_mnt->mnt_slave);
list_add(&mnt->mnt_slave_list, &peer_mnt->mnt_slave);
set_mnt_slave(mnt);
} else {
struct vfsmount *slave_mnt, *t_slave_mnt;
list_for_each_entry_safe(slave_mnt, t_slave_mnt,
&mnt->mnt_slave, mnt_slave_list) {
CLEAR_MNT_SLAVE(slave_mnt);
list_del_init(&slave_mnt->mnt_slave_list);
}
}
list_del_init(&mnt->mnt_slave);
mnt->mnt_master = peer_mnt;
spin_unlock(&vfspnode_lock);
out:
return err;
}
Do you still believe that your idea is simpler?
The most difficult part is attaching a shared vfs tree, which needs to be
attached at some other shared mount point. The problem here is that while
traversing the propagation tree, one has to build another similar
propagation tree for the new child mount. It's a two-dimensional tree walk,
i.e. one walk is along the vfs tree, and the other walk is along the pnode
tree for each mount. It's much easier to abstract out the pnode
operations and concentrate on them separately than to mix the
functionality of vfs and pnode in a single vfs data structure.
In my code, I have abstracted out the pnode tree traversing operation
using a single iterator function, pnode_next().
Think about it, and let me know if it is worth the effort of changing
the implementation. I sincerely feel it just shifts complexity instead
of reducing complexity. I can eventually come up with a fully tested
implementation using your idea, but I am still not convinced that
it reduces complexity.
RP
>
> Here's a less buggy (and even more simplified) version of the
> function. Note: it must be called with 'origin' either on a slave
> list or at the root pnode. That's because the function checks if the
> all vfsmounts in a pnode have been traversed by looking at the
> emptiness of mnt_slave. So if origin was in a slave pnode, but is not
> the actual slave link, the algorithm will go off the starting pnode
> and up to it's master.
>
> So here's a preparation function that finds the right place to start
> the propagation.
>
> static struct vfsmount *propagation_first(struct vfsmount *p)
> {
> struct vfsmount *q = p;
>
> while (list_empty(&q->mnt_slave)) {
> q = next_shared(q);
> if (q == p)
> break;
> }
> return q;
> }
>
> static struct vfsmount *propagation_next(struct vfsmount *p,
> struct vfsmount *origin)
> {
> /* are there any slaves of this mount? */
> if (!list_empty(&p->mnt_slave_list))
> return first_slave(p);
>
> while (1) {
> /* if p->mnt_share is empty, this is a no-op */
> p = next_shared(p);
>
> /* finished traversing? */
> if (p == origin)
> break;
>
> /* more vfsmounts belong to the pnode? */
> if (list_empty(&p->mnt_slave))
> return p;
>
> /* more slaves? */
> if (p->mnt_slave.next != &p->mnt_master->mnt_slave_list)
> return next_slave(p);
>
> /* back at master */
> p = p->mnt_master;
> }
>
> return NULL;
> }
>
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [PATCH 1/7] shared subtree
2005-07-31 0:45 ` Ram Pai
@ 2005-07-31 7:52 ` Miklos Szeredi
2005-07-31 8:25 ` Miklos Szeredi
0 siblings, 1 reply; 20+ messages in thread
From: Miklos Szeredi @ 2005-07-31 7:52 UTC (permalink / raw)
To: linuxram; +Cc: akpm, viro, mathurav, mike, janak, linux-fsdevel, linux-kernel
> Ok. I have started implementing your idea. But the implementation is no
> simple. Its becomes a complex mess. Atleast in the case of pnode
> datastructure implementation, the propogation was all abstracted and
> concentrated in the pnode datastructure.
>
> Here is a sample implementation of do_make_slave() with your idea.
>
> static int do_make_slave(struct vfsmount *mnt)
> {
> int err=0;
> struct vfsmount *peer_mnt;
>
> spin_lock(&vfspnode_lock);
> if (!IS_MNT_SHARED(mnt)) {
> spin_unlock(&vfspnode_lock);
> err = -EINVAL;
> goto out;
> }
>
> peer_mnt = list_entry(mnt->mnt_share.next, struct vfsmount,
> mnt_share);
> if (peer_mnt == mnt)
> peer_mnt = NULL;
>
> list_del_init(&mnt->mnt_share);
> if (peer_mnt) {
> /* move the slave list to the peer_mnt */
> list_splice(&mnt->mnt_slave, &peer_mnt->mnt_slave);
> list_add(&mnt->mnt_slave_list, &peer_mnt->mnt_slave);
> set_mnt_slave(mnt);
> } else {
> struct vfsmount *slave_mnt, *t_slave_mnt;
> list_for_each_entry_safe(slave_mnt, t_slave_mnt,
> &mnt->mnt_slave, mnt_slave_list) {
> CLEAR_MNT_SLAVE(slave_mnt);
> list_del_init(&slave_mnt->mnt_slave_list);
> }
> }
> list_del_init(&mnt->mnt_slave);
> mnt->mnt_master = peer_mnt;
> spin_unlock(&vfspnode_lock);
> out:
> return err;
> }
>
> Do you still believe that your idea is simpler?
Well, you have bundled do_make_slave(), pnode_member_to_slave() and
empty_pnode() all into one function. I think your original split is
quite nice. If you'd split this function up like that, I think you'd
agree, that the end result is simpler.
Btw. though you don't have a pnode datastructure, that doesn't mean
you can't separate the shared subtree related functions into a
separate pnode.c (or whatever) file. I think the do_make_* functions
also belong in there, not namespace.c.
> The most difficult part is, attaching shared vfs tree, which needs
> to be attached at someother shared mount point. The problem here is
> while traversing the propogation tree, one has to build another
> similar propogation tree for the new child mount. Its a 2
> dimensional tree walk, i.e one walk is along the vfstree, and the
> other walk is along the pnode tree for each mount. Its much easier
> to abstract out the pnode operations and concentrate on them
> separately than mixing the functionality of vfs and pnode in a
> single vfs datastructure.
I think you are bluring the distinction between two separate concepts:
"structure" and "function".
You can concentrate separate "structures" in a single object, without
having to mix the "functions".
The linux 'struct list' is a wonderful way to do this.
C++ inheritance is another (much less wonderful) way.
Mixing "functions" is bad. Concentrating "structure" is good.
> In my code, I have abstracted out the pnode tree traversing operation
> using a single iterator function pnode_next().
Well, but the pnode traversing is only part of the problem. In the
end you really need to traverse vfsmounts. Pnodes are really just a
conceptual helper. I think Al Viro didn't make that clear enough in
his RFC.
> Think about it, and let me know if it is worth the effort of changing
> the implementation. I sincerely feel it just shifts complexity instead
> of reducing complexity. I can eventually come up with a fully tested
> implementation using your idea, but I am still not convinced that
> it reduces complexity.
I really don't want to force it on you. If you feel comfortable with
the current code, that's OK. But keep in mind, that this is a central
part of the kernel, and so the code needs to be understood to the last
little detail, not just by you. And doing that with >1000 new lines
of code is not an easy task.
So the less code (and less complexity) you are able to make this
functionality work, the easier it will be for others to understand and
accept it.
Miklos
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [PATCH 1/7] shared subtree
2005-07-31 7:52 ` Miklos Szeredi
@ 2005-07-31 8:25 ` Miklos Szeredi
0 siblings, 0 replies; 20+ messages in thread
From: Miklos Szeredi @ 2005-07-31 8:25 UTC (permalink / raw)
To: linuxram; +Cc: akpm, viro, mathurav, mike, janak, linux-fsdevel, linux-kernel
> > Do you still believe that your idea is simpler?
>
> Well, you have bundled do_make_slave(), pnode_member_to_slave() and
> empty_pnode() all into one function. I think your original split is
> quite nice. If you'd split this function up like that, I think you'd
> agree, that the end result is simpler.
Also you can still use the pnode concept in naming functions and
explanations. For example empty_pnode() is a good function name even
if there's no 'struct pnode'. Pnodes still exist, they just don't
have a corresponding object.
Miklos
^ permalink raw reply [flat|nested] 20+ messages in thread
* (no subject)
2005-07-25 22:44 Ram Pai
2005-07-25 22:44 ` Ram Pai
@ 2005-07-25 22:44 ` Ram Pai
2005-07-25 22:44 ` Ram Pai
` (5 subsequent siblings)
7 siblings, 0 replies; 20+ messages in thread
From: Ram Pai @ 2005-07-25 22:44 UTC (permalink / raw)
To: akpm, Al Viro; +Cc: Avantika Mathur, Mike Waychison
, miklos@szeredi.hu, Janak Desai <janak@us.ibm.com>, linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org
Subject: [PATCH 2/7] shared subtree
Content-Type: text/x-patch; name=unclone.patch
Content-Disposition: inline; filename=unclone.patch
Adds the ability to unclone a vfs tree. An uncloned vfs tree will not be
clonable, and hence cannot be bound/rbound to any other mountpoint.
RP
Signed by Ram Pai (linuxram@us.ibm.com)
fs/namespace.c | 15 ++++++++++++++-
include/linux/fs.h | 1 +
include/linux/mount.h | 15 +++++++++++++++
3 files changed, 30 insertions(+), 1 deletion(-)
Index: 2.6.12.work2/fs/namespace.c
===================================================================
--- 2.6.12.work2.orig/fs/namespace.c
+++ 2.6.12.work2/fs/namespace.c
@@ -673,6 +673,14 @@ static int do_make_private(struct vfsmou
return 0;
}
+static int do_make_unclone(struct vfsmount *mnt)
+{
+ if(mnt->mnt_pnode)
+ pnode_disassociate_mnt(mnt);
+ set_mnt_unclone(mnt);
+ return 0;
+}
+
/*
* recursively change the type of the mountpoint.
*/
@@ -682,6 +690,7 @@ static int do_change_type(struct nameida
int err=0;
if (!(flag & MS_SHARED) && !(flag & MS_PRIVATE)
+ && !(flag & MS_UNCLONE)
&& !(flag & MS_SLAVE))
return -EINVAL;
@@ -700,6 +709,9 @@ static int do_change_type(struct nameida
case MS_PRIVATE:
err = do_make_private(m);
break;
+ case MS_UNCLONE:
+ err = do_make_unclone(m);
+ break;
}
}
spin_unlock(&vfsmount_lock);
@@ -1140,7 +1152,8 @@ long do_mount(char * dev_name, char * di
data_page);
else if (flags & MS_BIND)
retval = do_loopback(&nd, dev_name, flags & MS_REC);
- else if (flags & MS_SHARED || flags & MS_PRIVATE || flags & MS_SLAVE)
+ else if (flags & MS_SHARED || flags & MS_UNCLONE ||
+ flags & MS_PRIVATE || flags & MS_SLAVE)
retval = do_change_type(&nd, flags);
else if (flags & MS_MOVE)
retval = do_move_mount(&nd, dev_name);
Index: 2.6.12.work2/include/linux/fs.h
===================================================================
--- 2.6.12.work2.orig/include/linux/fs.h
+++ 2.6.12.work2/include/linux/fs.h
@@ -102,6 +102,7 @@ extern int dir_notify_enable;
#define MS_MOVE 8192
#define MS_REC 16384
#define MS_VERBOSE 32768
+#define MS_UNCLONE (1<<17) /* recursively change to unclonnable */
#define MS_PRIVATE (1<<18) /* recursively change to private */
#define MS_SLAVE (1<<19) /* recursively change to slave */
#define MS_SHARED (1<<20) /* recursively change to shared */
Index: 2.6.12.work2/include/linux/mount.h
===================================================================
--- 2.6.12.work2.orig/include/linux/mount.h
+++ 2.6.12.work2/include/linux/mount.h
@@ -22,15 +22,18 @@
#define MNT_PRIVATE 0x10 /* if the vfsmount is private, by default it is private*/
#define MNT_SLAVE 0x20 /* if the vfsmount is a slave mount of its pnode */
#define MNT_SHARED 0x40 /* if the vfsmount is a slave mount of its pnode */
+#define MNT_UNCLONE 0x80 /* if the vfsmount is unclonable */
#define MNT_PNODE_MASK 0xf0 /* propogation flag mask */
#define IS_MNT_SHARED(mnt) (mnt->mnt_flags & MNT_SHARED)
#define IS_MNT_SLAVE(mnt) (mnt->mnt_flags & MNT_SLAVE)
#define IS_MNT_PRIVATE(mnt) (mnt->mnt_flags & MNT_PRIVATE)
+#define IS_MNT_UNCLONE(mnt) (mnt->mnt_flags & MNT_UNCLONE)
#define CLEAR_MNT_SHARED(mnt) (mnt->mnt_flags &= ~(MNT_PNODE_MASK & MNT_SHARED))
#define CLEAR_MNT_PRIVATE(mnt) (mnt->mnt_flags &= ~(MNT_PNODE_MASK & MNT_PRIVATE))
#define CLEAR_MNT_SLAVE(mnt) (mnt->mnt_flags &= ~(MNT_PNODE_MASK & MNT_SLAVE))
+#define CLEAR_MNT_UNCLONE(mnt) (mnt->mnt_flags &= ~(MNT_PNODE_MASK & MNT_UNCLONE))
struct vfsmount
{
@@ -59,6 +62,7 @@ static inline void set_mnt_shared(struct
mnt->mnt_flags |= MNT_PNODE_MASK & MNT_SHARED;
CLEAR_MNT_PRIVATE(mnt);
CLEAR_MNT_SLAVE(mnt);
+ CLEAR_MNT_UNCLONE(mnt);
}
static inline void set_mnt_private(struct vfsmount *mnt)
@@ -66,6 +70,16 @@ static inline void set_mnt_private(struc
mnt->mnt_flags |= MNT_PNODE_MASK & MNT_PRIVATE;
CLEAR_MNT_SLAVE(mnt);
CLEAR_MNT_SHARED(mnt);
+ CLEAR_MNT_UNCLONE(mnt);
+ mnt->mnt_pnode = NULL;
+}
+
+static inline void set_mnt_unclone(struct vfsmount *mnt)
+{
+ mnt->mnt_flags |= MNT_PNODE_MASK & MNT_UNCLONE;
+ CLEAR_MNT_SLAVE(mnt);
+ CLEAR_MNT_SHARED(mnt);
+ CLEAR_MNT_PRIVATE(mnt);
mnt->mnt_pnode = NULL;
}
@@ -74,6 +88,7 @@ static inline void set_mnt_slave(struct
mnt->mnt_flags |= MNT_PNODE_MASK & MNT_SLAVE;
CLEAR_MNT_PRIVATE(mnt);
CLEAR_MNT_SHARED(mnt);
+ CLEAR_MNT_UNCLONE(mnt);
}
static inline struct vfsmount *mntget(struct vfsmount *mnt)
^ permalink raw reply [flat|nested] 20+ messages in thread
* (no subject)
2005-07-25 22:44 Ram Pai
2005-07-25 22:44 ` Ram Pai
2005-07-25 22:44 ` Ram Pai
@ 2005-07-25 22:44 ` Ram Pai
2005-07-27 19:13 ` [PATCH 3/7] shared subtree Miklos Szeredi
2005-07-25 22:44 ` Ram Pai
` (4 subsequent siblings)
7 siblings, 1 reply; 20+ messages in thread
From: Ram Pai @ 2005-07-25 22:44 UTC (permalink / raw)
To: akpm, Al Viro; +Cc: Avantika Mathur, Mike Waychison
, miklos@szeredi.hu, Janak Desai <janak@us.ibm.com>, linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org
Subject: [PATCH 3/7] shared subtree
Content-Type: text/x-patch; name=rbind.patch
Content-Disposition: inline; filename=rbind.patch
Adds the ability to bind/rbind a shared/private/slave subtree and set up
propagation wherever needed.
RP
Signed by Ram Pai (linuxram@us.ibm.com)
fs/namespace.c | 660 ++++++++++++++++++++++++++++++++++++++++------
fs/pnode.c | 235 ++++++++++++++++
include/linux/dcache.h | 2
include/linux/fs.h | 5
include/linux/namespace.h | 1
5 files changed, 826 insertions(+), 77 deletions(-)
Index: 2.6.12.work2/fs/namespace.c
===================================================================
--- 2.6.12.work2.orig/fs/namespace.c
+++ 2.6.12.work2/fs/namespace.c
@@ -42,7 +42,8 @@ static inline int sysfs_init(void)
static struct list_head *mount_hashtable;
static int hash_mask, hash_bits;
-static kmem_cache_t *mnt_cache;
+static kmem_cache_t *mnt_cache;
+static struct rw_semaphore namespace_sem;
static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
{
@@ -54,7 +55,7 @@ static inline unsigned long hash(struct
struct vfsmount *alloc_vfsmnt(const char *name)
{
- struct vfsmount *mnt = kmem_cache_alloc(mnt_cache, GFP_KERNEL);
+ struct vfsmount *mnt = kmem_cache_alloc(mnt_cache, GFP_KERNEL);
if (mnt) {
memset(mnt, 0, sizeof(struct vfsmount));
atomic_set(&mnt->mnt_count,1);
@@ -86,7 +87,8 @@ void free_vfsmnt(struct vfsmount *mnt)
* Now, lookup_mnt increments the ref count before returning
* the vfsmount struct.
*/
-struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
+struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
+ struct dentry *root)
{
struct list_head * head = mount_hashtable + hash(mnt, dentry);
struct list_head * tmp = head;
@@ -99,7 +101,8 @@ struct vfsmount *lookup_mnt(struct vfsmo
if (tmp == head)
break;
p = list_entry(tmp, struct vfsmount, mnt_hash);
- if (p->mnt_parent == mnt && p->mnt_mountpoint == dentry) {
+ if (p->mnt_parent == mnt && p->mnt_mountpoint == dentry &&
+ (root == NULL || p->mnt_root == root)) {
found = mntget(p);
break;
}
@@ -108,6 +111,37 @@ struct vfsmount *lookup_mnt(struct vfsmo
return found;
}
+struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
+{
+ return __lookup_mnt(mnt, dentry, NULL);
+}
+
+static struct vfsmount *
+clone_mnt(struct vfsmount *old, struct dentry *root)
+{
+ struct super_block *sb = old->mnt_sb;
+ struct vfsmount *mnt = alloc_vfsmnt(old->mnt_devname);
+
+ if (mnt) {
+ mnt->mnt_flags = old->mnt_flags;
+ atomic_inc(&sb->s_active);
+ mnt->mnt_sb = sb;
+ mnt->mnt_root = dget(root);
+ mnt->mnt_mountpoint = mnt->mnt_root;
+ mnt->mnt_parent = mnt;
+ mnt->mnt_namespace = old->mnt_namespace;
+ mnt->mnt_pnode = get_pnode(old->mnt_pnode);
+
+ /* stick the duplicate mount on the same expiry list
+ * as the original if that was on one */
+ spin_lock(&vfsmount_lock);
+ if (!list_empty(&old->mnt_fslink))
+ list_add(&mnt->mnt_fslink, &old->mnt_fslink);
+ spin_unlock(&vfsmount_lock);
+ }
+ return mnt;
+}
+
static inline int check_mnt(struct vfsmount *mnt)
{
return mnt->mnt_namespace == current->namespace;
@@ -128,11 +162,71 @@ static void attach_mnt(struct vfsmount *
{
mnt->mnt_parent = mntget(nd->mnt);
mnt->mnt_mountpoint = dget(nd->dentry);
- list_add(&mnt->mnt_hash, mount_hashtable+hash(nd->mnt, nd->dentry));
+ mnt->mnt_namespace = nd->mnt->mnt_namespace;
+ list_add_tail(&mnt->mnt_hash,
+ mount_hashtable+hash(nd->mnt, nd->dentry));
list_add_tail(&mnt->mnt_child, &nd->mnt->mnt_mounts);
nd->dentry->d_mounted++;
}
+static void attach_prepare_mnt(struct vfsmount *mnt, struct nameidata *nd)
+{
+ mnt->mnt_parent = mntget(nd->mnt);
+ mnt->mnt_mountpoint = dget(nd->dentry);
+ nd->dentry->d_mounted++;
+}
+
+
+void do_attach_commit_mnt(struct vfsmount *mnt)
+{
+ struct vfsmount *parent = mnt->mnt_parent;
+ BUG_ON(parent==mnt);
+ if(list_empty(&mnt->mnt_hash))
+ list_add_tail(&mnt->mnt_hash,
+ mount_hashtable+hash(parent, mnt->mnt_mountpoint));
+ if(list_empty(&mnt->mnt_child))
+ list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
+ mnt->mnt_namespace = parent->mnt_namespace;
+ list_add_tail(&mnt->mnt_list, &mnt->mnt_namespace->list);
+}
+
+struct vfsmount *do_attach_prepare_mnt(struct vfsmount *mnt,
+ struct dentry *dentry,
+ struct vfsmount *template_mnt,
+ int clone_flag)
+{
+ struct vfsmount *child_mnt;
+ struct nameidata nd;
+
+ if (clone_flag) {
+ if(!(child_mnt = clone_mnt(template_mnt,
+ template_mnt->mnt_root)))
+ return NULL;
+ } else
+ child_mnt = template_mnt;
+
+ nd.mnt = mnt;
+ nd.dentry = dentry;
+
+ attach_prepare_mnt(child_mnt, &nd);
+
+ return child_mnt;
+}
+
+void do_detach_prepare_mnt(struct vfsmount *mnt, int free_flag)
+{
+ mnt->mnt_mountpoint->d_mounted--;
+ mntput(mnt->mnt_parent);
+ dput(mnt->mnt_mountpoint);
+ if (free_flag) {
+ BUG_ON(atomic_read(&mnt->mnt_count) != 1);
+ spin_lock(&vfsmount_lock);
+ list_del_init(&mnt->mnt_fslink);
+ spin_unlock(&vfsmount_lock);
+ mntput(mnt);
+ }
+}
+
static struct vfsmount *next_mnt(struct vfsmount *p, struct vfsmount *root)
{
struct list_head *next = p->mnt_mounts.next;
@@ -149,29 +243,14 @@ static struct vfsmount *next_mnt(struct
return list_entry(next, struct vfsmount, mnt_child);
}
-static struct vfsmount *
-clone_mnt(struct vfsmount *old, struct dentry *root)
+static struct vfsmount *skip_mnt_tree(struct vfsmount *p)
{
- struct super_block *sb = old->mnt_sb;
- struct vfsmount *mnt = alloc_vfsmnt(old->mnt_devname);
-
- if (mnt) {
- mnt->mnt_flags = old->mnt_flags;
- atomic_inc(&sb->s_active);
- mnt->mnt_sb = sb;
- mnt->mnt_root = dget(root);
- mnt->mnt_mountpoint = mnt->mnt_root;
- mnt->mnt_parent = mnt;
- mnt->mnt_namespace = old->mnt_namespace;
-
- /* stick the duplicate mount on the same expiry list
- * as the original if that was on one */
- spin_lock(&vfsmount_lock);
- if (!list_empty(&old->mnt_fslink))
- list_add(&mnt->mnt_fslink, &old->mnt_fslink);
- spin_unlock(&vfsmount_lock);
+ struct list_head *prev = p->mnt_mounts.prev;
+ while (prev != &p->mnt_mounts) {
+ p = list_entry(prev, struct vfsmount, mnt_child);
+ prev = p->mnt_mounts.prev;
}
- return mnt;
+ return p;
}
void __mntput(struct vfsmount *mnt)
@@ -191,7 +270,7 @@ static void *m_start(struct seq_file *m,
struct list_head *p;
loff_t l = *pos;
- down_read(&n->sem);
+ down_read(&namespace_sem);
list_for_each(p, &n->list)
if (!l--)
return list_entry(p, struct vfsmount, mnt_list);
@@ -208,8 +287,7 @@ static void *m_next(struct seq_file *m,
static void m_stop(struct seq_file *m, void *v)
{
- struct namespace *n = m->private;
- up_read(&n->sem);
+ up_read(&namespace_sem);
}
static inline void mangle(struct seq_file *m, const char *s)
@@ -433,7 +511,7 @@ static int do_umount(struct vfsmount *mn
return retval;
}
- down_write(¤t->namespace->sem);
+ down_write(&namespace_sem);
spin_lock(&vfsmount_lock);
if (atomic_read(&sb->s_active) == 1) {
@@ -455,7 +533,7 @@ static int do_umount(struct vfsmount *mn
spin_unlock(&vfsmount_lock);
if (retval)
security_sb_umount_busy(mnt);
- up_write(¤t->namespace->sem);
+ up_write(&namespace_sem);
return retval;
}
@@ -495,9 +573,9 @@ out:
#ifdef __ARCH_WANT_SYS_OLDUMOUNT
/*
- * The 2.0 compatible umount. No flags.
+ * The 2.0 compatible umount. No flags.
*/
-
+
asmlinkage long sys_oldumount(char __user * name)
{
return sys_umount(name,0);
@@ -541,6 +619,9 @@ static struct vfsmount *copy_tree(struct
struct list_head *h;
struct nameidata nd;
+ if (IS_MNT_UNCLONE(mnt))
+ return NULL;
+
res = q = clone_mnt(mnt, dentry);
if (!q)
goto Enomem;
@@ -549,10 +630,15 @@ static struct vfsmount *copy_tree(struct
p = mnt;
for (h = mnt->mnt_mounts.next; h != &mnt->mnt_mounts; h = h->next) {
r = list_entry(h, struct vfsmount, mnt_child);
+
if (!lives_below_in_same_fs(r->mnt_mountpoint, dentry))
continue;
for (s = r; s; s = next_mnt(s, r)) {
+ if (IS_MNT_UNCLONE(s)) {
+ s = skip_mnt_tree(s);
+ continue;
+ }
while (p != s->mnt_parent) {
p = p->mnt_parent;
q = q->mnt_parent;
@@ -579,9 +665,276 @@ static struct vfsmount *copy_tree(struct
return NULL;
}
+/*
+ * return 1 if the mount tree contains a shared or slave mount
+ */
+static inline int tree_contains_sharedorslave(struct vfsmount *mnt)
+{
+ struct vfsmount *p;
+ for (p = mnt; p; p = next_mnt(p, mnt)) {
+ if (IS_MNT_SHARED(p) || IS_MNT_SLAVE(p))
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * commit the operations done in attach_recursive_mnt(). run through pnode list
+ * headed at 'pnodehead', and commit the operation done in
+ * attach_recursive_mnt();
+ */
+
+static void commit_attach_recursive_mnt(struct list_head *pnodehead)
+{
+ struct vfspnode *t_p, *tmp_pnode;
+
+ /*
+ * Merge or delete or slave each of the temporary pnode
+ */
+ spin_lock(&vfsmount_lock);
+ list_for_each_entry_safe(tmp_pnode, t_p, pnodehead,
+ pnode_peer_slave) {
+
+ int del_flag = IS_PNODE_DELETE(tmp_pnode);
+ int slave_flag = IS_PNODE_SLAVE(tmp_pnode);
+ struct vfspnode *master_pnode = tmp_pnode->pnode_master;
+
+ list_del_init(&tmp_pnode->pnode_peer_slave);
+ pnode_commit_mount(tmp_pnode, del_flag);
+
+ if (!del_flag && master_pnode) {
+ tmp_pnode->pnode_master = NULL;
+
+ if (slave_flag)
+ pnode_add_slave_pnode(master_pnode, tmp_pnode);
+ else
+ pnode_merge_pnode(tmp_pnode, master_pnode);
+
+ /*
+ * we don't need the extra reference to
+ * the master_pnode, which was created either
+ * (a) pnode_add_slave_pnode: when the mnt
+ * was made as a slave mnt.
+ * (b) pnode_merge_pnode: during clone_mnt().
+ */
+ put_pnode(master_pnode);
+ }
+ }
+ spin_unlock(&vfsmount_lock);
+}
+
+/*
+ * abort the operations done in attach_recursive_mnt(). run through the mount
+ * tree, till vfsmount 'last' and undo the changes. Ensure that all the mounts
+ * in the tree are all back in the mnt_list headed at 'source_mnt'.
+ * NOTE: This function is closely tied to the logic in
+ * 'attach_recursive_mnt()'
+ */
+static void abort_attach_recursive_mnt(struct vfsmount *source_mnt, struct
+ vfsmount *last, struct list_head *head) { struct vfsmount *p =
+ source_mnt, *m; struct vfspnode *src_pnode;
+
+ if (!last)
+ return;
+
+ do {
+ int is_unclone, is_pnode_slave;
+
+ m = p;
+ is_unclone = IS_MNT_UNCLONE(m);
+
+ BUG_ON(!m->mnt_pnode);
+
+ is_pnode_slave = IS_PNODE_SLAVE(m->mnt_pnode);
+ src_pnode = m->mnt_pnode->pnode_master;
+ m->mnt_pnode->pnode_master = NULL;
+ pnode_abort_mount(m->mnt_pnode, m);
+
+ m->mnt_pnode = src_pnode;
+ if (src_pnode) {
+ if(is_pnode_slave)
+ set_mnt_slave(m);
+ else
+ set_mnt_shared(m);
+ } else {
+ if (is_unclone)
+ set_mnt_unclone(m);
+ else
+ set_mnt_private(m);
+ }
+
+
+ list_add_tail(&m->mnt_list, head);
+ p = next_mnt(m, source_mnt);
+
+ } while ( p && m != last );
+ source_mnt->mnt_parent = source_mnt;
+ list_del_init(head);
+}
+
+ /*
+ * @source_mnt : mount tree to be attached
+ * @nd : place the mount tree @source_mnt is attached
+ *
+ * NOTE: in the table below explains the semantics when a source vfsmount
+ * of a given type is attached to a destination vfsmount of a give type.
+ * ---------------------------------------------------------------------
+ * | BIND MOUNT OPERATION |
+ * |*******************************************************************|
+ * | dest --> | shared | private | slave |unclonable |
+ * | source | | | | |
+ * | | | | | | |
+ * | v | | | | |
+ * |*******************************************************************|
+ * | | | | | |
+ * | shared | shared (++) | shared (+)|shared (+)| shared (+)|
+ * | | | | | |
+ * | | | | | |
+ * | private | shared (+) | private | private | private |
+ * | | | | | |
+ * | | | | | |
+ * | slave | shared (+) | private | private | private |
+ * | | | | | |
+ * | | | | | |
+ * | unclonable| nomount | nomount | nomount | nomount |
+ * | | | | | |
+ * | | | | | |
+ * ********************************************************************
+ *
+ * (++) the mount will be propogated to all the vfsmounts in the pnode tree
+ * of the destination vfsmount, and all the non-slave new mounts in
+ * destination vfsmount will be added the source vfsmount's pnode.
+ * (+) the mount will be propogated to the destination vfsmount
+ * and the new mount will be added to the source vfsmount's pnode.
+ *
+ * if the source mount is a tree, the operations explained above is
+ * applied to each vfsmount in the tree.
+ *
+ * Should be called without spinlocks held, because this function can sleep
+ * in allocations.
+ *
+ */
+static int attach_recursive_mnt(struct vfsmount *source_mnt,
+ struct nameidata *nd)
+{
+ struct vfsmount *mntpt_mnt, *last, *m, *p;
+ struct vfspnode *src_pnode, *dest_pnode, *tmp_pnode;
+ struct dentry *mntpt_dentry;
+ int ret;
+ LIST_HEAD(pnodehead);
+ LIST_HEAD(mnt_list_head);
+
+ /*
+ * if the source tree has no shared or slave mounts and
+ * the destination mount is not shared, fastpath.
+ */
+ mntpt_mnt = nd->mnt;
+ dest_pnode = IS_MNT_SHARED(mntpt_mnt) ? mntpt_mnt->mnt_pnode : NULL;
+ if (!dest_pnode && !tree_contains_sharedorslave(source_mnt)) {
+ spin_lock(&vfsmount_lock);
+ attach_mnt(source_mnt, nd);
+ list_add_tail(&mnt_list_head, &source_mnt->mnt_list);
+ list_splice(&mnt_list_head, mntpt_mnt->mnt_namespace->list.prev);
+ spin_unlock(&vfsmount_lock);
+ goto out;
+ }
+
+ /*
+ * Create temporary pnodes which shall hold all the new
+ * mounts. Merge or delete or slave that pnode later in a separate
+ * operation, depending on the type of source and destination mounts.
+ */
+ p = NULL;
+ last = NULL;
+ list_add_tail(&mnt_list_head, &source_mnt->mnt_list);
+
+ for (m = source_mnt; m; m = next_mnt(m, source_mnt)) {
+
+ BUG_ON(IS_MNT_UNCLONE(m));
+
+ while (p && p != m->mnt_parent)
+ p = p->mnt_parent;
+
+ if (!p) {
+ mntpt_dentry = nd->dentry;
+ mntpt_mnt = nd->mnt;
+ } else {
+ mntpt_dentry = m->mnt_mountpoint;
+ mntpt_mnt = p;
+ }
+ p=m;
+
+ dest_pnode = IS_MNT_SHARED(mntpt_mnt) ?
+ mntpt_mnt->mnt_pnode : NULL;
+ src_pnode = (IS_MNT_SHARED(m))?
+ m->mnt_pnode : NULL;
+
+ /*
+ * get a temporary pnode into which add the new vfs, and keep
+ * track of these pnodes and their real pnode.
+ */
+ if (!(tmp_pnode = pnode_alloc())) {
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ m->mnt_pnode = NULL;
+ list_del_init(&m->mnt_list);
+ list_add_tail(&tmp_pnode->pnode_peer_slave, &pnodehead);
+
+ if (dest_pnode) {
+ if ((ret = pnode_prepare_mount(dest_pnode, tmp_pnode,
+ mntpt_dentry, m, mntpt_mnt))) {
+ tmp_pnode->pnode_master = src_pnode;
+ m->mnt_pnode = tmp_pnode;
+ last = m;
+ goto error;
+ }
+ } else {
+ if (m == m->mnt_parent)
+ do_attach_prepare_mnt(mntpt_mnt,
+ mntpt_dentry, m, 0);
+ pnode_add_member_mnt(tmp_pnode, m);
+ if (!src_pnode) {
+ set_mnt_private(m);
+ /*
+ * NOTE: set_mnt_private()
+ * resets m->mnt_pnode.
+ * Reinitialize it. This is needed to
+ * decrement the refcount on the
+ * pnode when the mount 'm' is
+ * unlinked in pnode_commit_mount().
+ */
+ m->mnt_pnode = tmp_pnode;
+ SET_PNODE_DELETE(tmp_pnode);
+ }
+ }
+
+ /*
+ * temporarily track the pnode with which the tmp_pnode
+ * has to merge with; in the pnode_master field.
+ */
+ tmp_pnode->pnode_master = src_pnode;
+ last = m;
+ }
+ commit_attach_recursive_mnt(&pnodehead);
+out:
+ mntget(source_mnt);
+ return 0;
+error:
+ /*
+ * ok we have errored out either because of memory exhaustion
+ * or something else not in our control. Gracefully return
+ * leaving no mess behind. Else it will haunt you. :(
+ */
+ abort_attach_recursive_mnt(source_mnt, last, &mnt_list_head);
+ return 1;
+}
+
static int graft_tree(struct vfsmount *mnt, struct nameidata *nd)
{
- int err;
+ int err, ret;
+
if (mnt->mnt_sb->s_flags & MS_NOUSER)
return -EINVAL;
@@ -599,17 +952,12 @@ static int graft_tree(struct vfsmount *m
goto out_unlock;
err = -ENOENT;
- spin_lock(&vfsmount_lock);
- if (IS_ROOT(nd->dentry) || !d_unhashed(nd->dentry)) {
- struct list_head head;
- attach_mnt(mnt, nd);
- list_add_tail(&head, &mnt->mnt_list);
- list_splice(&head, current->namespace->list.prev);
- mntget(mnt);
- err = 0;
- }
+ spin_lock(&vfsmount_lock);
+ ret = (IS_ROOT(nd->dentry) || !d_unhashed(nd->dentry));
spin_unlock(&vfsmount_lock);
+ if (ret)
+ err = attach_recursive_mnt(mnt, nd);
out_unlock:
up(&nd->dentry->d_inode->i_sem);
if (!err)
@@ -681,6 +1029,147 @@ static int do_make_unclone(struct vfsmou
return 0;
}
+ /*
+ * This operation is equivalent of mount --bind dir dir
+ * create a new mount at the dentry, and unmount all child mounts
+ * mounted on top of dentries below 'dentry', and mount them
+ * under the new mount.
+ */
+struct vfsmount *do_make_mounted(struct vfsmount *mnt, struct dentry *dentry)
+{
+ struct vfsmount *child_mnt, *next;
+ struct nameidata nd;
+ struct vfsmount *newmnt = clone_mnt(mnt, dentry);
+ LIST_HEAD(head);
+
+ /*
+ * note clone_mnt() gets a reference to the pnode.
+ * we won't use that pnode anyway. So just let it
+ * go
+ */
+ put_pnode(newmnt->mnt_pnode);
+ newmnt->mnt_pnode = NULL;
+
+ if (newmnt) {
+ /*
+ * walk through the mount list of mnt and move
+ * them under the new mount
+ */
+ spin_lock(&vfsmount_lock);
+ list_del_init(&newmnt->mnt_fslink);
+
+ list_for_each_entry_safe(child_mnt, next,
+ &mnt->mnt_mounts, mnt_child) {
+
+ if(child_mnt->mnt_mountpoint == dentry)
+ continue;
+
+ if(!is_subdir(child_mnt->mnt_mountpoint, dentry))
+ continue;
+
+ detach_mnt(child_mnt, &nd);
+ nd.mnt = newmnt;
+ attach_mnt(child_mnt, &nd);
+ }
+
+ nd.mnt = mnt;
+ nd.dentry = dentry;
+ attach_mnt(newmnt, &nd);
+ list_add_tail(&newmnt->mnt_list, &newmnt->mnt_namespace->list);
+ spin_unlock(&vfsmount_lock);
+ }
+ return newmnt;
+}
+
+ /*
+ * Inverse operation of do_make_mounted()
+ */
+int do_make_unmounted(struct vfsmount *mnt)
+{
+ struct vfsmount *parent_mnt, *child_mnt, *next;
+ struct nameidata nd;
+
+ /* validate if mount has a different parent */
+ parent_mnt = mnt->mnt_parent;
+ if (mnt == parent_mnt)
+ return 0;
+ /*
+ * cannot unmount a mount that is not created
+ * as a overlay mount.
+ */
+ if (mnt->mnt_mountpoint != mnt->mnt_root)
+ return -EINVAL;
+
+ /* for each submounts in the parent, put the mounts back */
+ spin_lock(&vfsmount_lock);
+ list_for_each_entry_safe(child_mnt, next, &mnt->mnt_mounts, mnt_child) {
+ detach_mnt(child_mnt, &nd);
+ nd.mnt = parent_mnt;
+ attach_mnt(child_mnt, &nd);
+ }
+ detach_mnt(mnt, &nd);
+ spin_unlock(&vfsmount_lock);
+ return 0;
+}
+
+/*
+ * @nd: contains the vfsmount and the dentry where the new mount
+ * is the be created
+ * @mnt: returns the newly created mount.
+ * Create a new mount at the location specified by 'nd' and
+ * propogate the mount to all other mounts if the mountpoint
+ * is under a shared mount.
+ */
+int make_mounted(struct nameidata *nd, struct vfsmount **mnt)
+{
+ struct vfsmount *parent_mnt;
+ struct dentry *parent_dentry;
+ int err = mount_is_safe(nd);
+ if (err)
+ return err;
+ parent_dentry = nd->dentry;
+ parent_mnt = nd->mnt;
+ /*
+ * check if dentry already has a vfsmount
+ * if it does not, create and attach
+ * a new vfsmount at that dentry.
+ * Also propogate the mount if parent_mnt
+ * is shared.
+ */
+ if(parent_dentry != parent_mnt->mnt_root) {
+ *mnt = IS_MNT_SHARED(parent_mnt) ?
+ pnode_make_mounted(parent_mnt->mnt_pnode,
+ parent_mnt, parent_dentry) :
+ do_make_mounted(parent_mnt, parent_dentry);
+ if (!*mnt)
+ err = -ENOMEM;
+ } else
+ *mnt = parent_mnt;
+ return err;
+}
+
+ /*
+ * Inverse operation of make_mounted()
+ */
+int make_unmounted(struct vfsmount *mnt)
+{
+ if (mnt == mnt->mnt_parent)
+ return 0;
+ /*
+ * cannot unmount a mount that is not created
+ * as a overlay mount.
+ */
+ if (mnt->mnt_mountpoint != mnt->mnt_root)
+ return -EINVAL;
+
+ if (IS_MNT_SHARED(mnt))
+ pnode_make_unmounted(mnt->mnt_pnode);
+ else
+ do_make_unmounted(mnt);
+
+ return 0;
+}
+
/*
* recursively change the type of the mountpoint.
*/
@@ -724,7 +1213,7 @@ static int do_change_type(struct nameida
static int do_loopback(struct nameidata *nd, char *old_name, int recurse)
{
struct nameidata old_nd;
- struct vfsmount *mnt = NULL;
+ struct vfsmount *mnt = NULL, *overlay_mnt=NULL;
int err = mount_is_safe(nd);
if (err)
return err;
@@ -734,14 +1223,31 @@ static int do_loopback(struct nameidata
if (err)
return err;
- down_write(¤t->namespace->sem);
+ if (IS_MNT_UNCLONE(old_nd.mnt)) {
+ err = -EINVAL;
+ goto path_release;
+ }
+
+ down_write(&namespace_sem);
err = -EINVAL;
if (check_mnt(nd->mnt) && (!recurse || check_mnt(old_nd.mnt))) {
+
+ /*
+ * If the dentry is not the root dentry, and if a bind
+ * from a shared subtree is attempted, create a mount
+ * at the dentry, and use the new mount as the starting
+ * point for the bind/rbind operation.
+ */
+ overlay_mnt = old_nd.mnt;
+ if(IS_MNT_SHARED(old_nd.mnt) &&
+ (err = make_mounted(&old_nd, &overlay_mnt)))
+ goto out;
+
err = -ENOMEM;
if (recurse)
- mnt = copy_tree(old_nd.mnt, old_nd.dentry);
+ mnt = copy_tree(overlay_mnt, old_nd.dentry);
else
- mnt = clone_mnt(old_nd.mnt, old_nd.dentry);
+ mnt = clone_mnt(overlay_mnt, old_nd.dentry);
}
if (mnt) {
@@ -752,15 +1258,25 @@ static int do_loopback(struct nameidata
err = graft_tree(mnt, nd);
if (err) {
- spin_lock(&vfsmount_lock);
- umount_tree(mnt);
- spin_unlock(&vfsmount_lock);
- } else
- mntput(mnt);
- }
+ spin_lock(&vfsmount_lock);
+ umount_tree(mnt);
+ spin_unlock(&vfsmount_lock);
+ /*
+ * ok we failed! so undo any overlay
+ * mount that we did earlier.
+ */
+ if (old_nd.mnt != overlay_mnt)
+ make_unmounted(overlay_mnt);
+ } else
+ mntput(mnt);
+ }
+
+ out:
+ up_write(&namespace_sem);
+
+ path_release:
+ path_release(&old_nd);
- up_write(¤t->namespace->sem);
- path_release(&old_nd);
return err;
}
@@ -808,7 +1324,7 @@ static int do_move_mount(struct nameidat
if (err)
return err;
- down_write(¤t->namespace->sem);
+ down_write(&namespace_sem);
while(d_mountpoint(nd->dentry) && follow_down(&nd->mnt, &nd->dentry))
;
err = -EINVAL;
@@ -852,7 +1368,7 @@ out2:
out1:
up(&nd->dentry->d_inode->i_sem);
out:
- up_write(¤t->namespace->sem);
+ up_write(&namespace_sem);
if (!err)
path_release(&parent_nd);
path_release(&old_nd);
@@ -891,7 +1407,7 @@ int do_add_mount(struct vfsmount *newmnt
{
int err;
- down_write(¤t->namespace->sem);
+ down_write(&namespace_sem);
/* Something was mounted here while we slept */
while(d_mountpoint(nd->dentry) && follow_down(&nd->mnt, &nd->dentry))
;
@@ -920,7 +1436,7 @@ int do_add_mount(struct vfsmount *newmnt
}
unlock:
- up_write(¤t->namespace->sem);
+ up_write(&namespace_sem);
mntput(newmnt);
return err;
}
@@ -976,7 +1492,7 @@ void mark_mounts_for_expiry(struct list_
get_namespace(namespace);
spin_unlock(&vfsmount_lock);
- down_write(&namespace->sem);
+ down_write(&namespace_sem);
spin_lock(&vfsmount_lock);
/* check that it is still dead: the count should now be 2 - as
@@ -1020,7 +1536,7 @@ void mark_mounts_for_expiry(struct list_
spin_unlock(&vfsmount_lock);
}
- up_write(&namespace->sem);
+ up_write(&namespace_sem);
mntput(mnt);
put_namespace(namespace);
@@ -1066,7 +1582,7 @@ int copy_mount_options(const void __user
int i;
unsigned long page;
unsigned long size;
-
+
*where = 0;
if (!data)
return 0;
@@ -1085,7 +1601,7 @@ int copy_mount_options(const void __user
i = size - exact_copy_from_user((void *)page, data, size);
if (!i) {
- free_page(page);
+ free_page(page);
return -EFAULT;
}
if (i != PAGE_SIZE)
@@ -1191,14 +1707,13 @@ int copy_namespace(int flags, struct tas
goto out;
atomic_set(&new_ns->count, 1);
- init_rwsem(&new_ns->sem);
INIT_LIST_HEAD(&new_ns->list);
- down_write(&tsk->namespace->sem);
+ down_write(&namespace_sem);
/* First pass: copy the tree topology */
new_ns->root = copy_tree(namespace->root, namespace->root->mnt_root);
if (!new_ns->root) {
- up_write(&tsk->namespace->sem);
+ up_write(&namespace_sem);
kfree(new_ns);
goto out;
}
@@ -1232,7 +1747,7 @@ int copy_namespace(int flags, struct tas
p = next_mnt(p, namespace->root);
q = next_mnt(q, new_ns->root);
}
- up_write(&tsk->namespace->sem);
+ up_write(&namespace_sem);
tsk->namespace = new_ns;
@@ -1414,7 +1929,7 @@ asmlinkage long sys_pivot_root(const cha
user_nd.mnt = mntget(current->fs->rootmnt);
user_nd.dentry = dget(current->fs->root);
read_unlock(¤t->fs->lock);
- down_write(¤t->namespace->sem);
+ down_write(&namespace_sem);
down(&old_nd.dentry->d_inode->i_sem);
error = -EINVAL;
if (!check_mnt(user_nd.mnt))
@@ -1460,7 +1975,7 @@ asmlinkage long sys_pivot_root(const cha
path_release(&parent_nd);
out2:
up(&old_nd.dentry->d_inode->i_sem);
- up_write(¤t->namespace->sem);
+ up_write(&namespace_sem);
path_release(&user_nd);
path_release(&old_nd);
out1:
@@ -1487,7 +2002,6 @@ static void __init init_mount_tree(void)
panic("Can't allocate initial namespace");
atomic_set(&namespace->count, 1);
INIT_LIST_HEAD(&namespace->list);
- init_rwsem(&namespace->sem);
list_add(&mnt->mnt_list, &namespace->list);
namespace->root = mnt;
mnt->mnt_namespace = namespace;
@@ -1510,6 +2024,8 @@ void __init mnt_init(unsigned long mempa
unsigned int nr_hash;
int i;
+ init_rwsem(&namespace_sem);
+
mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct vfsmount),
0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
@@ -1557,7 +2073,7 @@ void __put_namespace(struct namespace *n
{
struct vfsmount *mnt;
- down_write(&namespace->sem);
+ down_write(&namespace_sem);
spin_lock(&vfsmount_lock);
list_for_each_entry(mnt, &namespace->list, mnt_list) {
@@ -1566,6 +2082,6 @@ void __put_namespace(struct namespace *n
umount_tree(namespace->root);
spin_unlock(&vfsmount_lock);
- up_write(&namespace->sem);
+ up_write(&namespace_sem);
kfree(namespace);
}
Index: 2.6.12.work2/fs/pnode.c
===================================================================
--- 2.6.12.work2.orig/fs/pnode.c
+++ 2.6.12.work2/fs/pnode.c
@@ -26,7 +26,6 @@
#include <asm/unistd.h>
#include <stdarg.h>
-
static kmem_cache_t * pnode_cachep;
/* spinlock for pnode related operations */
@@ -90,10 +89,12 @@ static void inline pnode_add_mnt(struct
mnt->mnt_pnode = pnode;
if (slave) {
set_mnt_slave(mnt);
- list_add(&mnt->mnt_pnode_mntlist, &pnode->pnode_slavevfs);
+ list_add(&mnt->mnt_pnode_mntlist,
+ &pnode->pnode_slavevfs);
} else {
set_mnt_shared(mnt);
- list_add(&mnt->mnt_pnode_mntlist, &pnode->pnode_vfs);
+ list_add(&mnt->mnt_pnode_mntlist,
+ &pnode->pnode_vfs);
}
get_pnode(pnode);
spin_unlock(&vfspnode_lock);
@@ -111,7 +112,6 @@ void pnode_add_slave_mnt(struct vfspnode
pnode_add_mnt(pnode, mnt, 1);
}
-
void pnode_add_slave_pnode(struct vfspnode *pnode,
struct vfspnode *slave_pnode)
{
@@ -439,3 +439,230 @@ error:
pnode_end(&context);
goto out;
}
+
+int pnode_mount_func(struct vfspnode *pnode, void *indata,
+ void **outdata, va_list args)
+{
+ struct vfspnode *pnode_slave, *pnode_master;
+ int ret=0;
+
+ pnode_master = indata;
+
+ if (*outdata)
+ pnode_slave = *outdata;
+ else if (!(pnode_slave = pnode_alloc()))
+ return -ENOMEM;
+
+ *outdata = pnode_slave;
+
+ if (pnode_slave && pnode_master)
+ pnode_add_slave_pnode(pnode_master, pnode_slave);
+ return ret;
+}
+
+int vfs_make_mounted_func(struct vfsmount *mnt, enum pnode_vfs_type flag,
+ void *indata, va_list args)
+{
+ struct dentry *target_dentry;
+ int ret=0;
+ struct vfsmount *child_mount;
+ struct vfspnode *pnode;
+
+ target_dentry = va_arg(args, struct dentry *);
+ if (!(child_mount = do_make_mounted(mnt, target_dentry))) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ pnode = (struct vfspnode *)indata;
+ switch (flag) {
+ case PNODE_SLAVE_VFS :
+ pnode_add_slave_mnt(pnode, child_mount);
+ break;
+ case PNODE_MEMBER_VFS :
+ pnode_add_member_mnt(pnode, child_mount);
+ break;
+ }
+
+out:
+ return ret;
+}
+
+/*
+ * @pnode: pnode that contains the vfsmounts, on which the
+ * new mount is created at dentry 'dentry'
+ * @dentry: the dentry on which the new mount is created
+ * @mnt: return the mount created on this vfsmount
+ * walk through all the vfsmounts belonging to this pnode
+ * as well as its slave pnodes and for each vfsmount create
+ * a new vfsmount at 'dentry'. Return the vfsmount created
+ * at 'dentry' of vfsmount 'mnt'.
+ */
+struct vfsmount *pnode_make_mounted(struct vfspnode *pnode,
+ struct vfsmount *mnt, struct dentry *dentry)
+{
+ struct vfsmount *child_mnt;
+ struct vfspnode *child_pnode;
+
+ if (!(child_pnode = pnode_alloc()))
+ return NULL;
+
+ if (pnode_traverse(pnode, child_pnode, (void *)NULL,
+ pnode_mount_func, NULL, vfs_make_mounted_func,
+ (void *)dentry))
+ goto error;
+ child_mnt = __lookup_mnt(mnt, dentry, dentry);
+ mntput(child_mnt);
+ return child_mnt;
+
+error:
+ pnode_make_unmounted(child_pnode);
+ return NULL;
+}
+
+int vfs_make_unmounted_func(struct vfsmount *mnt, enum pnode_vfs_type flag,
+ void *indata, va_list args)
+{
+ struct vfspnode *pnode;
+ int ret=0;
+
+ if (do_make_unmounted(mnt)) {
+ ret = 1;
+ goto out;
+ }
+
+ pnode = mnt->mnt_pnode;
+ spin_lock(&vfspnode_lock);
+ list_del_init(&mnt->mnt_pnode_mntlist);
+ put_pnode_locked(pnode);
+ spin_unlock(&vfspnode_lock);
+out:
+ return ret;
+}
+
+int pnode_make_unmounted(struct vfspnode *pnode)
+{
+ return pnode_traverse(pnode, NULL, (void *)NULL,
+ NULL, NULL, vfs_make_unmounted_func);
+}
+
+int vfs_prepare_mount_func(struct vfsmount *mnt, enum pnode_vfs_type flag,
+ void *indata, va_list args)
+{
+ struct vfsmount *source_mnt, *child_mnt, *p_mnt;
+ struct dentry *mountpoint_dentry;
+ struct vfspnode *pnode = (struct vfspnode *)indata;
+
+ source_mnt = va_arg(args, struct vfsmount * );
+ mountpoint_dentry = va_arg(args, struct dentry *);
+ p_mnt = va_arg(args, struct vfsmount *);
+
+ if ((p_mnt != mnt) || (source_mnt == source_mnt->mnt_parent)) {
+ child_mnt = do_attach_prepare_mnt(mnt, mountpoint_dentry,
+ source_mnt, (p_mnt != mnt));
+ if (!child_mnt)
+ return -ENOMEM;
+
+ if (child_mnt != source_mnt)
+ put_pnode(source_mnt->mnt_pnode);
+ } else
+ child_mnt = source_mnt;
+
+ switch (flag) {
+ case PNODE_SLAVE_VFS :
+ pnode_add_slave_mnt(pnode, child_mnt);
+ break;
+ case PNODE_MEMBER_VFS :
+ pnode_add_member_mnt(pnode, child_mnt);
+ break;
+ }
+
+ return 0;
+}
+
+int pnode_prepare_mount(struct vfspnode *pnode,
+ struct vfspnode *master_child_pnode,
+ struct dentry *mountpoint_dentry,
+ struct vfsmount *source_mnt,
+ struct vfsmount *mnt)
+{
+ return pnode_traverse(pnode,
+ master_child_pnode,
+ (void *)NULL,
+ pnode_mount_func,
+ NULL,
+ vfs_prepare_mount_func,
+ source_mnt,
+ mountpoint_dentry,
+ mnt);
+}
+
+int pnode_commit_mount_post_func(struct vfspnode *pnode, void *indata,
+ va_list args)
+{
+ if (va_arg(args, int)) {
+ spin_lock(&vfspnode_lock);
+ BUG_ON(!list_empty(&pnode->pnode_vfs));
+ BUG_ON(!list_empty(&pnode->pnode_slavevfs));
+ BUG_ON(!list_empty(&pnode->pnode_slavepnode));
+ list_del_init(&pnode->pnode_peer_slave);
+ put_pnode_locked(pnode);
+ spin_unlock(&vfspnode_lock);
+ }
+ return 0;
+}
+
+int vfs_commit_mount_func(struct vfsmount *mnt, enum pnode_vfs_type flag,
+ void *indata, va_list args)
+{
+ BUG_ON(mnt == mnt->mnt_parent);
+ do_attach_commit_mnt(mnt);
+ if (va_arg(args, int)) {
+ spin_lock(&vfspnode_lock);
+ list_del_init(&mnt->mnt_pnode_mntlist);
+ put_pnode_locked(mnt->mnt_pnode);
+ spin_unlock(&vfspnode_lock);
+ mnt->mnt_pnode = NULL;
+ }
+ return 0;
+}
+
+/*
+ * @pnode: walk the propogation tree and complete the
+ * attachments of the child mounts to the parents
+ * correspondingly.
+ * @flag: if set destroy the propogation tree
+ */
+int pnode_commit_mount(struct vfspnode *pnode, int flag)
+{
+ return pnode_traverse(pnode,
+ NULL, (void *)NULL, NULL, pnode_commit_mount_post_func,
+ vfs_commit_mount_func, flag);
+}
+
+int vfs_abort_mount_func(struct vfsmount *mnt,
+ enum pnode_vfs_type flag, void *indata, va_list args)
+
+{
+ struct vfsmount *exception_mnt = va_arg(args, struct vfsmount *);
+ BUG_ON(!mnt->mnt_pnode);
+ pnode_disassociate_mnt(mnt);
+ do_detach_prepare_mnt(mnt, (exception_mnt != mnt));
+ return 0;
+}
+
+/*
+ * clean the propogation tree under pnode, releasing all
+ * the mounts, except exception_mnt
+ * @pnode: the pnode tree to be cleanup unlinking and
+ * releasing all pnodes in the tree as well as
+ * unlinking any mounts, except 'exception_mnt'
+ * @exception_mnt: the mnt to be unlinked from pnode
+ * bug not released.
+ */
+int pnode_abort_mount(struct vfspnode *pnode,
+ struct vfsmount *exception_mnt)
+{
+ return pnode_traverse(pnode,
+ NULL, (void *)NULL, NULL, NULL,
+ vfs_abort_mount_func, exception_mnt);
+}
Index: 2.6.12.work2/include/linux/fs.h
===================================================================
--- 2.6.12.work2.orig/include/linux/fs.h
+++ 2.6.12.work2/include/linux/fs.h
@@ -1216,7 +1216,12 @@ extern struct vfsmount *kern_mount(struc
extern int may_umount_tree(struct vfsmount *);
extern int may_umount(struct vfsmount *);
extern long do_mount(char *, char *, char *, unsigned long, void *);
+extern struct vfsmount *do_attach_prepare_mnt(struct vfsmount *,
+ struct dentry *, struct vfsmount *, int);
+extern void do_attach_commit_mnt(struct vfsmount *);
extern struct vfsmount *do_make_mounted(struct vfsmount *, struct dentry *);
+extern int do_make_unmounted(struct vfsmount *);
+extern void do_detach_prepare_mnt(struct vfsmount *, int);
extern int vfs_statfs(struct super_block *, struct kstatfs *);
Index: 2.6.12.work2/include/linux/namespace.h
===================================================================
--- 2.6.12.work2.orig/include/linux/namespace.h
+++ 2.6.12.work2/include/linux/namespace.h
@@ -9,7 +9,6 @@ struct namespace {
atomic_t count;
struct vfsmount * root;
struct list_head list;
- struct rw_semaphore sem;
};
extern void umount_tree(struct vfsmount *);
Index: 2.6.12.work2/include/linux/dcache.h
===================================================================
--- 2.6.12.work2.orig/include/linux/dcache.h
+++ 2.6.12.work2/include/linux/dcache.h
@@ -329,6 +329,8 @@ static inline int d_mountpoint(struct de
}
extern struct vfsmount *lookup_mnt(struct vfsmount *, struct dentry *);
+extern struct vfsmount *__lookup_mnt(struct vfsmount *,
+ struct dentry *, struct dentry *);
extern struct dentry *lookup_create(struct nameidata *nd, int is_dir);
extern int sysctl_vfs_cache_pressure;
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [PATCH 3/7] shared subtree
2005-07-25 22:44 ` Ram Pai
@ 2005-07-27 19:13 ` Miklos Szeredi
2005-07-27 20:30 ` Ram Pai
0 siblings, 1 reply; 20+ messages in thread
From: Miklos Szeredi @ 2005-07-27 19:13 UTC (permalink / raw)
To: linuxram; +Cc: mathurav, mike, janak, linux-fsdevel, linux-kernel
> @@ -54,7 +55,7 @@ static inline unsigned long hash(struct
>
> struct vfsmount *alloc_vfsmnt(const char *name)
> {
> - struct vfsmount *mnt = kmem_cache_alloc(mnt_cache, GFP_KERNEL);
> + struct vfsmount *mnt = kmem_cache_alloc(mnt_cache, GFP_KERNEL);
> if (mnt) {
> memset(mnt, 0, sizeof(struct vfsmount));
> atomic_set(&mnt->mnt_count,1);
Please make whitespace changes a separate patch.
> @@ -128,11 +162,71 @@ static void attach_mnt(struct vfsmount *
> {
> mnt->mnt_parent = mntget(nd->mnt);
> mnt->mnt_mountpoint = dget(nd->dentry);
> - list_add(&mnt->mnt_hash, mount_hashtable+hash(nd->mnt, nd->dentry));
> + mnt->mnt_namespace = nd->mnt->mnt_namespace;
> + list_add_tail(&mnt->mnt_hash,
> + mount_hashtable+hash(nd->mnt, nd->dentry));
> list_add_tail(&mnt->mnt_child, &nd->mnt->mnt_mounts);
> nd->dentry->d_mounted++;
> }
Why list_add_tail()? This changes user visible behavior, and seems
unnecessary.
> +static void attach_prepare_mnt(struct vfsmount *mnt, struct nameidata *nd)
> +{
> + mnt->mnt_parent = mntget(nd->mnt);
> + mnt->mnt_mountpoint = dget(nd->dentry);
> + nd->dentry->d_mounted++;
> +}
> +
> +
You shouldn't add unnecessary newlines. There are a lot of these,
please audit all your patches.
> +void do_attach_commit_mnt(struct vfsmount *mnt)
> +{
> + struct vfsmount *parent = mnt->mnt_parent;
> + BUG_ON(parent==mnt);
BUG_ON(parent == mnt);
> + if(list_empty(&mnt->mnt_hash))
if (list_empty(&mnt->mnt_hash))
> + list_add_tail(&mnt->mnt_hash,
> + mount_hashtable+hash(parent, mnt->mnt_mountpoint));
> + if(list_empty(&mnt->mnt_child))
> + list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
> + mnt->mnt_namespace = parent->mnt_namespace;
> + list_add_tail(&mnt->mnt_list, &mnt->mnt_namespace->list);
> +}
Etc. Maybe you should run Lindent on your changes, but be careful not
to change existing code, even if Lindent would do that!
> @@ -191,7 +270,7 @@ static void *m_start(struct seq_file *m,
> struct list_head *p;
> loff_t l = *pos;
>
> - down_read(&n->sem);
> + down_read(&namespace_sem);
> list_for_each(p, &n->list)
> if (!l--)
> return list_entry(p, struct vfsmount, mnt_list);
This should be a separate patch. You can just take the one from the
detached trees patch-series.
> +/*
> + * abort the operations done in attach_recursive_mnt(). run through the mount
> + * tree, till vfsmount 'last' and undo the changes. Ensure that all the mounts
> + * in the tree are all back in the mnt_list headed at 'source_mnt'.
> + * NOTE: This function is closely tied to the logic in
> + * 'attach_recursive_mnt()'
> + */
> +static void abort_attach_recursive_mnt(struct vfsmount *source_mnt, struct
> + vfsmount *last, struct list_head *head) { struct vfsmount *p =
> + source_mnt, *m; struct vfspnode *src_pnode;
If you want to do proper error handling, instead of doing rollback, it
seems better to first do anything that can fail (allocations), then do
the actual attaching, which cannot fail. It isn't nice to have
transient states on failure.
> + /*
> + * This operation is equivalent of mount --bind dir dir
> + * create a new mount at the dentry, and unmount all child mounts
> + * mounted on top of dentries below 'dentry', and mount them
> + * under the new mount.
> + */
> +struct vfsmount *do_make_mounted(struct vfsmount *mnt, struct dentry *dentry)
Why is this needed? I thought we agreed, that this can be removed.
Miklos
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [PATCH 3/7] shared subtree
2005-07-27 19:13 ` [PATCH 3/7] shared subtree Miklos Szeredi
@ 2005-07-27 20:30 ` Ram Pai
2005-07-28 8:34 ` Miklos Szeredi
0 siblings, 1 reply; 20+ messages in thread
From: Ram Pai @ 2005-07-27 20:30 UTC (permalink / raw)
To: Miklos Szeredi; +Cc: Avantika Mathur, mike, janak, linux-fsdevel, linux-kernel
On Wed, 2005-07-27 at 12:13, Miklos Szeredi wrote:
> > @@ -54,7 +55,7 @@ static inline unsigned long hash(struct
> >
> > struct vfsmount *alloc_vfsmnt(const char *name)
> > {
> > - struct vfsmount *mnt = kmem_cache_alloc(mnt_cache, GFP_KERNEL);
> > + struct vfsmount *mnt = kmem_cache_alloc(mnt_cache, GFP_KERNEL);
> > if (mnt) {
> > memset(mnt, 0, sizeof(struct vfsmount));
> > atomic_set(&mnt->mnt_count,1);
>
> Please make whitespace changes a separate patch.
I tried to remove trailing whitespace in the current code
wherever I found it. OK, I will make those a separate patch.
>
> > @@ -128,11 +162,71 @@ static void attach_mnt(struct vfsmount *
> > {
> > mnt->mnt_parent = mntget(nd->mnt);
> > mnt->mnt_mountpoint = dget(nd->dentry);
> > - list_add(&mnt->mnt_hash, mount_hashtable+hash(nd->mnt, nd->dentry));
> > + mnt->mnt_namespace = nd->mnt->mnt_namespace;
> > + list_add_tail(&mnt->mnt_hash,
> > + mount_hashtable+hash(nd->mnt, nd->dentry));
> > list_add_tail(&mnt->mnt_child, &nd->mnt->mnt_mounts);
> > nd->dentry->d_mounted++;
> > }
>
> Why list_add_tail()? This changes user visible behavior, and seems
> unnecessary.
Yes. I was about to send out a mail questioning the existing behavior. I
will start a separate thread questioning the current behavior. My plan
was to discuss the current behavior before making this change. I thought
I had reverted this change. But it slipped in.
>
> > +static void attach_prepare_mnt(struct vfsmount *mnt, struct nameidata *nd)
> > +{
> > + mnt->mnt_parent = mntget(nd->mnt);
> > + mnt->mnt_mountpoint = dget(nd->dentry);
> > + nd->dentry->d_mounted++;
> > +}
> > +
> > +
>
> You shouldn't add unnecessary newlines. There are a lot of these,
> please audit all your patches.
ok. sure.
>
> > +void do_attach_commit_mnt(struct vfsmount *mnt)
> > +{
> > + struct vfsmount *parent = mnt->mnt_parent;
> > + BUG_ON(parent==mnt);
>
> BUG_ON(parent == mnt);
>
> > + if(list_empty(&mnt->mnt_hash))
>
> if (list_empty(&mnt->mnt_hash))
>
> > + list_add_tail(&mnt->mnt_hash,
> > + mount_hashtable+hash(parent, mnt->mnt_mountpoint));
> > + if(list_empty(&mnt->mnt_child))
> > + list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
> > + mnt->mnt_namespace = parent->mnt_namespace;
> > + list_add_tail(&mnt->mnt_list, &mnt->mnt_namespace->list);
> > +}
>
> Etc. Maybe you should run Lindent on your changes, but be careful not
> to change existing code, even if Lindent would do that!
sure :)
>
> > @@ -191,7 +270,7 @@ static void *m_start(struct seq_file *m,
> > struct list_head *p;
> > loff_t l = *pos;
> >
> > - down_read(&n->sem);
> > + down_read(&namespace_sem);
> > list_for_each(p, &n->list)
> > if (!l--)
> > return list_entry(p, struct vfsmount, mnt_list);
>
> This should be a separate patch. You can just take the one from the
> detached trees patch-series.
ok. in fact these changes were motivated by that patch.
>
> > +/*
> > + * abort the operations done in attach_recursive_mnt(). run through the mount
> > + * tree, till vfsmount 'last' and undo the changes. Ensure that all the mounts
> > + * in the tree are all back in the mnt_list headed at 'source_mnt'.
> > + * NOTE: This function is closely tied to the logic in
> > + * 'attach_recursive_mnt()'
> > + */
> > +static void abort_attach_recursive_mnt(struct vfsmount *source_mnt, struct
> > + vfsmount *last, struct list_head *head) { struct vfsmount *p =
> > + source_mnt, *m; struct vfspnode *src_pnode;
>
> If you want to do proper error handling, instead of doing rollback, it
> seems better to first do anything that can fail (allocations), then do
> the actual attaching, which cannot fail. It isn't nice to have
> transient states on failure.
yes. it does exactly what you said. In the prepare stage it does not
touch any of the existing vfstree or the pnode tree.
All it does is build a new vfstree and pnode tree, and make the necessary
changes to them. If everything is successful, it glues the new tree
to the existing tree (which is the commit phase); and if the prepare
stage fails to allocate memory, or fails for any other reason, it goes
and destroys the new trees (in the abort phase).
Of course, in the prepare stage, it does increase the reference count of
the vfsmounts to which the new tree will be attached. This is to ensure
that the vfsmounts have not disappeared by the time we reach the commit
phase. I think we are talking about the same thing, and the code behaves
exactly as you said.
>
> > + /*
> > + * This operation is equivalent of mount --bind dir dir
> > + * create a new mount at the dentry, and unmount all child mounts
> > + * mounted on top of dentries below 'dentry', and mount them
> > + * under the new mount.
> > + */
> > +struct vfsmount *do_make_mounted(struct vfsmount *mnt, struct dentry *dentry)
>
> Why is this needed? I thought we agreed, that this can be removed.
yes, we agreed on returning EINVAL when a directory is attempted to be
made shared/private/slave/unclonable. But this is a different case.
lets say /mnt is a mountpoint having a vfsmount (say A).
now if you run
mount --bind /mnt/a /tmp
this operation succeeds currently.
now lets say /mnt is a mountpoint having a vfsmount which is shared.
now if you run
mount --bind /mnt/a /tmp
we now have a mount at /tmp which gets propagation from mounts under
/mnt/a. right?
but /mnt/a is not a mountpoint at all. If we need to track and
propagate mounts/unmounts under /tmp or /mnt/a, we need to have a mount
at /mnt/a. Which means we have to implicitly make /mnt/a a
mountpoint. The other solution is to return -EINVAL.
But returning -EINVAL is inconsistent with the standard behavior of
mount --bind:
the standard behavior allows bind mounts from any directory; it need not
be a mountpoint.
We had this exact discussion about the behavior with Mike Waychison, Al
Viro and Bruce Fields. Mike suggested the above implemented behavior.
RP
>
> Miklos
> -
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [PATCH 3/7] shared subtree
2005-07-27 20:30 ` Ram Pai
@ 2005-07-28 8:34 ` Miklos Szeredi
0 siblings, 0 replies; 20+ messages in thread
From: Miklos Szeredi @ 2005-07-28 8:34 UTC (permalink / raw)
To: linuxram; +Cc: mathurav, mike, janak, linux-fsdevel, linux-kernel
> yes we agreed on returning EINVAL when a directory is attempted to made
> shared/private/slave/unclonnable. But this is a different case.
>
> lets say /mnt is a mountpoint having a vfsmount (say A).
> now if you run
> mount --bind /mnt/a /tmp
> this operation succeeds currently.
>
> now lets say /mnt is a mountpoint having a vfsmount which is shared.
> now if you run
> mount --bind /mnt/a /tmp
>
> we now have a mount at /tmp which gets propogation from mounts under
> /mnt/a. right?
Yes.
> but /mnt/a is not a mountpoint at all. if we need to track and
> propogate mounts/unmounts under /tmp or /mnt/a we need to have a mount
> at /mnt/a.
I don't think we do. You can just check at propagation time if the
propagated mountpoint is visible in the destination mount or not.
Just like --rbind checks whether children mounts are below or above
the to-be-bound directory.
Miklos
^ permalink raw reply [flat|nested] 20+ messages in thread
* (no subject)
2005-07-25 22:44 Ram Pai
` (2 preceding siblings ...)
2005-07-25 22:44 ` Ram Pai
@ 2005-07-25 22:44 ` Ram Pai
2005-07-25 22:44 ` Ram Pai
` (3 subsequent siblings)
7 siblings, 0 replies; 20+ messages in thread
From: Ram Pai @ 2005-07-25 22:44 UTC (permalink / raw)
To: akpm, Al Viro; +Cc: Avantika Mathur, Mike Waychison
, miklos@szeredi.hu, Janak Desai <janak@us.ibm.com>, linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org
Subject: [PATCH 4/7] shared subtree
Content-Type: text/x-patch; name=move.patch
Content-Disposition: inline; filename=move.patch
Adds ability to move a shared/private/slave/unclone tree to any other
shared/private/slave/unclone tree. Also incorporates the same behavior
for pivot_root()
RP
Signed by Ram Pai (linuxram@us.ibm.com)
fs/namespace.c | 196 +++++++++++++++++++++++++++++++++++++++++++-------
include/linux/mount.h | 2
2 files changed, 173 insertions(+), 25 deletions(-)
Index: 2.6.12.work2/fs/namespace.c
===================================================================
--- 2.6.12.work2.orig/fs/namespace.c
+++ 2.6.12.work2/fs/namespace.c
@@ -772,9 +772,12 @@ static void abort_attach_recursive_mnt(s
list_del_init(head);
}
+
/*
* @source_mnt : mount tree to be attached
* @nd : place the mount tree @source_mnt is attached
+ * @move : use the move semantics if set, else use normal attach semantics
+ * as explained below
*
* NOTE: in the table below explains the semantics when a source vfsmount
* of a given type is attached to a destination vfsmount of a give type.
@@ -801,12 +804,41 @@ static void abort_attach_recursive_mnt(s
* | | | | | |
* ********************************************************************
*
- * (++) the mount will be propogated to all the vfsmounts in the pnode tree
+ * (++) the mount is propogated to all the vfsmounts in the pnode tree
* of the destination vfsmount, and all the non-slave new mounts in
* destination vfsmount will be added the source vfsmount's pnode.
- * (+) the mount will be propogated to the destination vfsmount
+ * (+) the mount is propogated to the destination vfsmount
* and the new mount will be added to the source vfsmount's pnode.
*
+ * ---------------------------------------------------------------------
+ * | MOVE MOUNT OPERATION |
+ * |*******************************************************************|
+ * | dest --> | shared | private | slave |unclonable |
+ * | source | | | | |
+ * | | | | | | |
+ * | v | | | | |
+ * |*******************************************************************|
+ * | | | | | |
+ * | shared | shared (++) | shared (+)|shared (+)| shared (+)|
+ * | | | | | |
+ * | | | | | |
+ * | private | shared (+) | private | private | private |
+ * | | | | | |
+ * | | | | | |
+ * | slave | shared (+++) | slave | slave | slave |
+ * | | | | | |
+ * | | | | | |
+ * | unclonable| invalid | unclonable |unclonable| unclonable|
+ * | | | | | |
+ * | | | | | |
+ * ********************************************************************
+ *
+ * (+++) the mount is propogated to all the vfsmounts in the pnode tree
+ * of the destination vfsmount, and all the new mounts is
+ * added to a new pnode , which is a slave pnode of the
+ * source vfsmount's pnode.
+ *
+ *
* if the source mount is a tree, the operations explained above is
* applied to each vfsmount in the tree.
*
@@ -815,7 +847,7 @@ static void abort_attach_recursive_mnt(s
*
*/
static int attach_recursive_mnt(struct vfsmount *source_mnt,
- struct nameidata *nd)
+ struct nameidata *nd, int move)
{
struct vfsmount *mntpt_mnt, *last, *m, *p;
struct vfspnode *src_pnode, *dest_pnode, *tmp_pnode;
@@ -849,8 +881,8 @@ static int attach_recursive_mnt(struct v
list_add_tail(&mnt_list_head, &source_mnt->mnt_list);
for (m = source_mnt; m; m = next_mnt(m, source_mnt)) {
-
- BUG_ON(IS_MNT_UNCLONE(m));
+ int unclone = IS_MNT_UNCLONE(m);
+ int slave = IS_MNT_SLAVE(m);
while (p && p != m->mnt_parent)
p = p->mnt_parent;
@@ -866,7 +898,7 @@ static int attach_recursive_mnt(struct v
dest_pnode = IS_MNT_SHARED(mntpt_mnt) ?
mntpt_mnt->mnt_pnode : NULL;
- src_pnode = (IS_MNT_SHARED(m))?
+ src_pnode = (IS_MNT_SHARED(m) || (move && slave))?
m->mnt_pnode : NULL;
/*
@@ -882,7 +914,7 @@ static int attach_recursive_mnt(struct v
list_del_init(&m->mnt_list);
list_add_tail(&tmp_pnode->pnode_peer_slave, &pnodehead);
- if (dest_pnode) {
+ if (dest_pnode && !unclone) {
if ((ret = pnode_prepare_mount(dest_pnode, tmp_pnode,
mntpt_dentry, m, mntpt_mnt))) {
tmp_pnode->pnode_master = src_pnode;
@@ -890,23 +922,33 @@ static int attach_recursive_mnt(struct v
last = m;
goto error;
}
+ if (move && dest_pnode && slave)
+ SET_PNODE_SLAVE(tmp_pnode);
} else {
if (m == m->mnt_parent)
do_attach_prepare_mnt(mntpt_mnt,
mntpt_dentry, m, 0);
- pnode_add_member_mnt(tmp_pnode, m);
- if (!src_pnode) {
- set_mnt_private(m);
+ if (move && slave)
+ pnode_add_slave_mnt(tmp_pnode, m);
+ else {
+ pnode_add_member_mnt(tmp_pnode, m);
+ if (unclone) {
+ BUG_ON(!move);
+ set_mnt_unclone(m);
+ m->mnt_pnode = tmp_pnode;
+ SET_PNODE_DELETE(tmp_pnode);
+ } else if (!src_pnode) {
+ set_mnt_private(m);
+ m->mnt_pnode = tmp_pnode;
+ SET_PNODE_DELETE(tmp_pnode);
+ }
/*
- * NOTE: set_mnt_private()
- * resets m->mnt_pnode.
- * Reinitialize it. This is needed to
- * decrement the refcount on the
- * pnode when the mount 'm' is
- * unlinked in pnode_commit_mount().
+ * NOTE: set_mnt_private() & set_mnt_unclone()
+ * resets m->mnt_pnode. Hence reinitialize it.
+ * We need this to decrement the refcount
+ * on the pnode when the mount 'm' is
+ * unlinked in pnode_commit_mount()
*/
- m->mnt_pnode = tmp_pnode;
- SET_PNODE_DELETE(tmp_pnode);
}
}
@@ -931,6 +973,46 @@ error:
return 1;
}
+static void
+detach_recursive_mnt(struct vfsmount *source_mnt, struct nameidata *nd)
+{
+ struct vfsmount *m;
+
+ detach_mnt(source_mnt, nd);
+ spin_lock(&vfspnode_lock);
+ for (m = source_mnt; m; m = next_mnt(m, source_mnt)) {
+ list_del_init(&m->mnt_pnode_mntlist);
+ list_del_init(&m->mnt_list);
+ if (m != source_mnt)
+ list_add_tail(&m->mnt_list, &source_mnt->mnt_list);
+ }
+ spin_unlock(&vfspnode_lock);
+}
+
+static void
+undo_detach_recursive_mnt(struct vfsmount *mnt, struct nameidata *nd)
+{
+ struct vfsmount *m;
+ LIST_HEAD(head);
+
+ spin_lock(&vfspnode_lock);
+ for (m = mnt; m; m = next_mnt(m, mnt)) {
+ if (m->mnt_pnode) {
+ if (IS_MNT_SHARED(m))
+ list_add(&m->mnt_pnode_mntlist,
+ &m->mnt_pnode->pnode_vfs);
+ if (IS_MNT_SLAVE(m))
+ list_add(&m->mnt_pnode_mntlist,
+ &m->mnt_pnode->pnode_slavevfs);
+ }
+ }
+ attach_mnt(mnt, nd);
+ spin_unlock(&vfspnode_lock);
+
+ list_add_tail(&head, &mnt->mnt_list);
+ list_splice(&head, nd->mnt->mnt_namespace->list.prev);
+}
+
static int graft_tree(struct vfsmount *mnt, struct nameidata *nd)
{
int err, ret;
@@ -957,7 +1039,7 @@ static int graft_tree(struct vfsmount *m
ret = (IS_ROOT(nd->dentry) || !d_unhashed(nd->dentry));
spin_unlock(&vfsmount_lock);
if (ret)
- err = attach_recursive_mnt(mnt, nd);
+ err = attach_recursive_mnt(mnt, nd, 0);
out_unlock:
up(&nd->dentry->d_inode->i_sem);
if (!err)
@@ -1311,6 +1393,19 @@ static int do_remount(struct nameidata *
return err;
}
+/*
+ * return 1 if the mount tree contains a unclonable mount
+ */
+static inline int tree_contains_unclone(struct vfsmount *mnt)
+{
+ struct vfsmount *p;
+ for (p = mnt; p; p = next_mnt(p, mnt)) {
+ if (IS_MNT_UNCLONE(p))
+ return 1;
+ }
+ return 0;
+}
+
static int do_move_mount(struct nameidata *nd, char *old_name)
{
struct nameidata old_nd, parent_nd;
@@ -1351,14 +1446,35 @@ static int do_move_mount(struct nameidat
S_ISDIR(old_nd.dentry->d_inode->i_mode))
goto out2;
+ /*
+ * Don't move a mount in a shared parent.
+ */
+ if (old_nd.mnt->mnt_parent &&
+ IS_MNT_SHARED(old_nd.mnt->mnt_parent))
+ goto out2;
+
+ /*
+ * Don't move a mount tree having unclonable
+ * mounts, under a shared mount
+ */
+ if (IS_MNT_SHARED(nd->mnt) &&
+ tree_contains_unclone(old_nd.mnt))
+ goto out2;
+
err = -ELOOP;
for (p = nd->mnt; p->mnt_parent!=p; p = p->mnt_parent)
if (p == old_nd.mnt)
goto out2;
err = 0;
- detach_mnt(old_nd.mnt, &parent_nd);
- attach_mnt(old_nd.mnt, nd);
+ detach_recursive_mnt(old_nd.mnt, &parent_nd);
+ spin_unlock(&vfsmount_lock);
+ if ((err = attach_recursive_mnt(old_nd.mnt, nd, 1))) {
+ undo_detach_recursive_mnt(old_nd.mnt, &parent_nd);
+ goto out1;
+ }
+ spin_lock(&vfsmount_lock);
+ mntput(old_nd.mnt);
/* if the mount is moved, it should no longer be expire
* automatically */
@@ -1949,6 +2065,16 @@ asmlinkage long sys_pivot_root(const cha
goto out2; /* not a mountpoint */
if (new_nd.mnt->mnt_root != new_nd.dentry)
goto out2; /* not a mountpoint */
+ /*
+ * Don't move a mount in a shared parent.
+ */
+ if(user_nd.mnt->mnt_parent &&
+ IS_MNT_SHARED(user_nd.mnt->mnt_parent))
+ goto out2;
+ if(new_nd.mnt->mnt_parent &&
+ IS_MNT_SHARED(new_nd.mnt->mnt_parent))
+ goto out2;
+
tmp = old_nd.mnt; /* make sure we can reach put_old from new_root */
spin_lock(&vfsmount_lock);
if (tmp != new_nd.mnt) {
@@ -1963,10 +2089,30 @@ asmlinkage long sys_pivot_root(const cha
goto out3;
} else if (!is_subdir(old_nd.dentry, new_nd.dentry))
goto out3;
- detach_mnt(new_nd.mnt, &parent_nd);
- detach_mnt(user_nd.mnt, &root_parent);
- attach_mnt(user_nd.mnt, &old_nd); /* mount old root on put_old */
- attach_mnt(new_nd.mnt, &root_parent); /* mount new_root on / */
+
+ detach_recursive_mnt(user_nd.mnt, &root_parent);
+ detach_recursive_mnt(new_nd.mnt, &parent_nd);
+
+ spin_unlock(&vfsmount_lock);
+ if ((error = attach_recursive_mnt(user_nd.mnt, &old_nd, 1))) {
+ spin_lock(&vfsmount_lock);
+ undo_detach_recursive_mnt(new_nd.mnt, &parent_nd);
+ undo_detach_recursive_mnt(user_nd.mnt, &root_parent);
+ goto out3;
+ }
+ spin_lock(&vfsmount_lock);
+ mntput(user_nd.mnt);
+
+ spin_unlock(&vfsmount_lock);
+ if ((error = attach_recursive_mnt(new_nd.mnt, &root_parent, 1))) {
+ spin_lock(&vfsmount_lock);
+ undo_detach_recursive_mnt(new_nd.mnt, &parent_nd);
+ undo_detach_recursive_mnt(user_nd.mnt, &root_parent);
+ goto out3;
+ }
+ spin_lock(&vfsmount_lock);
+ mntput(new_nd.mnt);
+
spin_unlock(&vfsmount_lock);
chroot_fs_refs(&user_nd, &new_nd);
security_sb_post_pivotroot(&user_nd, &new_nd);
Index: 2.6.12.work2/include/linux/mount.h
===================================================================
--- 2.6.12.work2.orig/include/linux/mount.h
+++ 2.6.12.work2/include/linux/mount.h
@@ -29,6 +29,8 @@
#define IS_MNT_SLAVE(mnt) (mnt->mnt_flags & MNT_SLAVE)
#define IS_MNT_PRIVATE(mnt) (mnt->mnt_flags & MNT_PRIVATE)
#define IS_MNT_UNCLONE(mnt) (mnt->mnt_flags & MNT_UNCLONE)
+#define GET_MNT_TYPE(mnt) (mnt->mnt_flags & MNT_PNODE_MASK)
+#define SET_MNT_TYPE(mnt, type) (mnt->mnt_flags |= (type & MNT_PNODE_MASK))
#define CLEAR_MNT_SHARED(mnt) (mnt->mnt_flags &= ~(MNT_PNODE_MASK & MNT_SHARED))
#define CLEAR_MNT_PRIVATE(mnt) (mnt->mnt_flags &= ~(MNT_PNODE_MASK & MNT_PRIVATE))
^ permalink raw reply [flat|nested] 20+ messages in thread* (no subject)
2005-07-25 22:44 Ram Pai
` (3 preceding siblings ...)
2005-07-25 22:44 ` Ram Pai
@ 2005-07-25 22:44 ` Ram Pai
2005-07-25 22:44 ` Ram Pai
` (2 subsequent siblings)
7 siblings, 0 replies; 20+ messages in thread
From: Ram Pai @ 2005-07-25 22:44 UTC (permalink / raw)
To: akpm, Al Viro; +Cc: Avantika Mathur, Mike Waychison
, miklos@szeredi.hu, Janak Desai <janak@us.ibm.com>, linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org
Subject: [PATCH 5/7] shared subtree
Content-Type: text/x-patch; name=umount.patch
Content-Disposition: inline; filename=umount.patch
Adds ability to unmount a shared/slave/unclone/private tree
RP
Signed by Ram Pai (linuxram@us.ibm.com)
fs/namespace.c | 76 ++++++++++++++++++++++++++++++++++++++++----------
fs/pnode.c | 66 +++++++++++++++++++++++++++++++++++++++++++
include/linux/fs.h | 3 +
include/linux/pnode.h | 9 ++++-
4 files changed, 138 insertions(+), 16 deletions(-)
Index: 2.6.12.work2/fs/pnode.c
===================================================================
--- 2.6.12.work2.orig/fs/pnode.c
+++ 2.6.12.work2/fs/pnode.c
@@ -666,3 +666,69 @@ int pnode_abort_mount(struct vfspnode *p
NULL, (void *)NULL, NULL, NULL,
vfs_abort_mount_func, exception_mnt);
}
+
+static int vfs_busy(struct vfsmount *mnt, enum pnode_vfs_type flag,
+ void *indata, va_list args)
+{
+ struct dentry *dentry = va_arg(args, struct dentry *);
+ struct dentry *rootdentry = va_arg(args, struct dentry *);
+ struct vfsmount *origmnt = va_arg(args, struct vfsmount *);
+ struct vfsmount *child_mnt;
+ int ret=0;
+
+ spin_unlock(&vfsmount_lock);
+ child_mnt = __lookup_mnt(mnt, dentry, rootdentry);
+ spin_lock(&vfsmount_lock);
+
+ if (!child_mnt)
+ return 0;
+
+ if (list_empty(&child_mnt->mnt_mounts)) {
+ if (origmnt == child_mnt)
+ ret = do_refcount_check(child_mnt, 3);
+ else
+ ret = do_refcount_check(child_mnt, 2);
+ }
+ mntput(child_mnt);
+ return ret;
+}
+
+int pnode_mount_busy(struct vfspnode *pnode, struct dentry *mntpt,
+ struct dentry *root, struct vfsmount *mnt)
+{
+ return pnode_traverse(pnode, NULL, NULL,
+ NULL, NULL, vfs_busy, mntpt, root, mnt);
+}
+
+
+int vfs_umount(struct vfsmount *mnt, enum pnode_vfs_type flag,
+ void *indata, va_list args)
+{
+ struct vfsmount *child_mnt;
+ struct dentry *dentry, *rootdentry;
+
+
+ dentry = va_arg(args, struct dentry *);
+ rootdentry = va_arg(args, struct dentry *);
+
+ spin_unlock(&vfsmount_lock);
+ child_mnt = __lookup_mnt(mnt, dentry, rootdentry);
+ spin_lock(&vfsmount_lock);
+ mntput(child_mnt);
+ if (child_mnt && list_empty(&child_mnt->mnt_mounts)) {
+ if (IS_MNT_SHARED(child_mnt) ||
+ IS_MNT_SLAVE(child_mnt)) {
+ BUG_ON(!child_mnt->mnt_pnode);
+ pnode_disassociate_mnt(child_mnt);
+ }
+ do_detach_mount(child_mnt);
+ }
+ return 0;
+}
+
+int pnode_umount(struct vfspnode *pnode, struct dentry *dentry,
+ struct dentry *rootdentry)
+{
+ return pnode_traverse(pnode, NULL, (void *)NULL,
+ NULL, NULL, vfs_umount, dentry, rootdentry);
+}
Index: 2.6.12.work2/fs/namespace.c
===================================================================
--- 2.6.12.work2.orig/fs/namespace.c
+++ 2.6.12.work2/fs/namespace.c
@@ -395,6 +395,20 @@ resume:
EXPORT_SYMBOL(may_umount_tree);
+int mount_busy(struct vfsmount *mnt)
+{
+ struct vfspnode *parent_pnode;
+
+ if (mnt == mnt->mnt_parent || !IS_MNT_SHARED(mnt->mnt_parent))
+ return do_refcount_check(mnt, 2);
+
+ parent_pnode = mnt->mnt_parent->mnt_pnode;
+ BUG_ON(!parent_pnode);
+ return pnode_mount_busy(parent_pnode,
+ mnt->mnt_mountpoint,
+ mnt->mnt_root, mnt);
+}
+
/**
* may_umount - check if a mount point is busy
* @mnt: root of mount
@@ -410,14 +424,28 @@ EXPORT_SYMBOL(may_umount_tree);
*/
int may_umount(struct vfsmount *mnt)
{
- if (atomic_read(&mnt->mnt_count) > 2)
+ if (mount_busy(mnt))
return -EBUSY;
return 0;
}
EXPORT_SYMBOL(may_umount);
-void umount_tree(struct vfsmount *mnt)
+void do_detach_mount(struct vfsmount *mnt)
+{
+ struct nameidata old_nd;
+ if (mnt != mnt->mnt_parent) {
+ detach_mnt(mnt, &old_nd);
+ path_release(&old_nd);
+ }
+ list_del_init(&mnt->mnt_list);
+ list_del_init(&mnt->mnt_fslink);
+ spin_unlock(&vfsmount_lock);
+ mntput(mnt);
+ spin_lock(&vfsmount_lock);
+}
+
+void __umount_tree(struct vfsmount *mnt, int propogate)
{
struct vfsmount *p;
LIST_HEAD(kill);
@@ -431,20 +459,40 @@ void umount_tree(struct vfsmount *mnt)
mnt = list_entry(kill.next, struct vfsmount, mnt_list);
list_del_init(&mnt->mnt_list);
list_del_init(&mnt->mnt_fslink);
- if (mnt->mnt_parent == mnt) {
- spin_unlock(&vfsmount_lock);
+ if (propogate && mnt->mnt_parent != mnt &&
+ IS_MNT_SHARED(mnt->mnt_parent)) {
+ struct vfspnode *parent_pnode
+ = mnt->mnt_parent->mnt_pnode;
+ BUG_ON(!parent_pnode);
+ pnode_umount(parent_pnode,
+ mnt->mnt_mountpoint,
+ mnt->mnt_root);
} else {
- struct nameidata old_nd;
- detach_mnt(mnt, &old_nd);
- spin_unlock(&vfsmount_lock);
- path_release(&old_nd);
+ if (IS_MNT_SHARED(mnt) || IS_MNT_SLAVE(mnt)) {
+ BUG_ON(!mnt->mnt_pnode);
+ pnode_disassociate_mnt(mnt);
+ }
+ do_detach_mount(mnt);
}
- mntput(mnt);
- spin_lock(&vfsmount_lock);
}
}
-static int do_umount(struct vfsmount *mnt, int flags)
+void umount_tree(struct vfsmount *mnt)
+{
+ __umount_tree(mnt, 1);
+}
+
+/*
+ * return true if the refcount is greater than count
+ */
+int do_refcount_check(struct vfsmount *mnt, int count)
+{
+
+ int mycount = atomic_read(&mnt->mnt_count);
+ return (mycount > count);
+}
+
+int do_umount(struct vfsmount *mnt, int flags)
{
struct super_block * sb = mnt->mnt_sb;
int retval;
@@ -525,7 +573,7 @@ static int do_umount(struct vfsmount *mn
spin_lock(&vfsmount_lock);
}
retval = -EBUSY;
- if (atomic_read(&mnt->mnt_count) == 2 || flags & MNT_DETACH) {
+ if (flags & MNT_DETACH || !mount_busy(mnt)) {
if (!list_empty(&mnt->mnt_list))
umount_tree(mnt);
retval = 0;
@@ -659,7 +707,7 @@ static struct vfsmount *copy_tree(struct
Enomem:
if (res) {
spin_lock(&vfsmount_lock);
- umount_tree(res);
+ __umount_tree(res, 0);
spin_unlock(&vfsmount_lock);
}
return NULL;
@@ -1341,7 +1389,7 @@ static int do_loopback(struct nameidata
err = graft_tree(mnt, nd);
if (err) {
spin_lock(&vfsmount_lock);
- umount_tree(mnt);
+ __umount_tree(mnt, 0);
spin_unlock(&vfsmount_lock);
/*
* ok we failed! so undo any overlay
Index: 2.6.12.work2/include/linux/fs.h
===================================================================
--- 2.6.12.work2.orig/include/linux/fs.h
+++ 2.6.12.work2/include/linux/fs.h
@@ -1216,12 +1216,15 @@ extern struct vfsmount *kern_mount(struc
extern int may_umount_tree(struct vfsmount *);
extern int may_umount(struct vfsmount *);
extern long do_mount(char *, char *, char *, unsigned long, void *);
+extern int do_umount(struct vfsmount *, int);
extern struct vfsmount *do_attach_prepare_mnt(struct vfsmount *,
struct dentry *, struct vfsmount *, int);
extern void do_attach_commit_mnt(struct vfsmount *);
extern struct vfsmount *do_make_mounted(struct vfsmount *, struct dentry *);
extern int do_make_unmounted(struct vfsmount *);
extern void do_detach_prepare_mnt(struct vfsmount *, int);
+extern void do_detach_mount(struct vfsmount *);
+extern int do_refcount_check(struct vfsmount *, int );
extern int vfs_statfs(struct super_block *, struct kstatfs *);
Index: 2.6.12.work2/include/linux/pnode.h
===================================================================
--- 2.6.12.work2.orig/include/linux/pnode.h
+++ 2.6.12.work2/include/linux/pnode.h
@@ -63,13 +63,15 @@ put_pnode_locked(struct vfspnode *pnode)
{
if (!pnode)
return;
- if (atomic_dec_and_test(&pnode->pnode_count)) {
+ if (atomic_dec_and_test(&pnode->pnode_count))
__put_pnode(pnode);
- }
}
void __init pnode_init(unsigned long );
struct vfspnode * pnode_alloc(void);
+void pnode_free(struct vfspnode *);
+int pnode_is_busy(struct vfspnode *);
+int pnode_umount_vfs(struct vfspnode *, struct dentry *, struct dentry *, int);
void pnode_add_slave_mnt(struct vfspnode *, struct vfsmount *);
void pnode_add_member_mnt(struct vfspnode *, struct vfsmount *);
void pnode_del_slave_mnt(struct vfsmount *);
@@ -87,4 +89,7 @@ int pnode_prepare_mount(struct vfspnode
struct vfsmount *, struct vfsmount *);
int pnode_commit_mount(struct vfspnode *, int);
int pnode_abort_mount(struct vfspnode *, struct vfsmount *);
+int pnode_umount(struct vfspnode *, struct dentry *, struct dentry *);
+int pnode_mount_busy(struct vfspnode *, struct dentry *, struct dentry *,
+ struct vfsmount *);
#endif /* _LINUX_PNODE_H */
^ permalink raw reply [flat|nested] 20+ messages in thread* (no subject)
2005-07-25 22:44 Ram Pai
` (4 preceding siblings ...)
2005-07-25 22:44 ` Ram Pai
@ 2005-07-25 22:44 ` Ram Pai
2005-07-25 22:44 ` Ram Pai
2005-07-26 2:53 ` supposed to be shared subtree patches Ram Pai
7 siblings, 0 replies; 20+ messages in thread
From: Ram Pai @ 2005-07-25 22:44 UTC (permalink / raw)
To: akpm, Al Viro; +Cc: Avantika Mathur, Mike Waychison
, miklos@szeredi.hu, Janak Desai <janak@us.ibm.com>, linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org
Subject: [PATCH 6/7] shared subtree
Content-Type: text/x-patch; name=namespace.patch
Content-Disposition: inline; filename=namespace.patch
Adds ability to clone a namespace that has shared/private/slave/unclone
subtrees in it.
RP
Signed by Ram Pai (linuxram@us.ibm.com)
fs/namespace.c | 9 +++++++++
1 files changed, 9 insertions(+)
Index: 2.6.12-rc6.work1/fs/namespace.c
===================================================================
--- 2.6.12-rc6.work1.orig/fs/namespace.c
+++ 2.6.12-rc6.work1/fs/namespace.c
@@ -1894,6 +1894,13 @@ int copy_namespace(int flags, struct tas
q = new_ns->root;
while (p) {
q->mnt_namespace = new_ns;
+
+ if (IS_MNT_SHARED(q))
+ pnode_add_member_mnt(q->mnt_pnode, q);
+ else if (IS_MNT_SLAVE(q))
+ pnode_add_slave_mnt(q->mnt_pnode, q);
+ put_pnode(q->mnt_pnode);
+
if (fs) {
if (p == fs->rootmnt) {
rootmnt = p;
@@ -2271,6 +2278,8 @@ void __put_namespace(struct namespace *n
spin_lock(&vfsmount_lock);
list_for_each_entry(mnt, &namespace->list, mnt_list) {
+ if (mnt->mnt_pnode)
+ pnode_disassociate_mnt(mnt);
mnt->mnt_namespace = NULL;
}
^ permalink raw reply [flat|nested] 20+ messages in thread* (no subject)
2005-07-25 22:44 Ram Pai
` (5 preceding siblings ...)
2005-07-25 22:44 ` Ram Pai
@ 2005-07-25 22:44 ` Ram Pai
2005-07-26 2:53 ` supposed to be shared subtree patches Ram Pai
7 siblings, 0 replies; 20+ messages in thread
From: Ram Pai @ 2005-07-25 22:44 UTC (permalink / raw)
To: akpm, Al Viro; +Cc: Avantika Mathur, Mike Waychison
, miklos@szeredi.hu, Janak Desai <janak@us.ibm.com>, linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org
Subject: [PATCH 7/7] shared subtree
Content-Type: text/x-patch; name=automount.patch
Content-Disposition: inline; filename=automount.patch
Adds support for mount/umount propagation for autofs-initiated operations.
RP
Signed by Ram Pai (linuxram@us.ibm.com)
fs/namespace.c | 176 +++++++++++++++++++-------------------------------
fs/pnode.c | 12 +--
include/linux/pnode.h | 3
3 files changed, 76 insertions(+), 115 deletions(-)
Index: 2.6.12.work2/fs/namespace.c
===================================================================
--- 2.6.12.work2.orig/fs/namespace.c
+++ 2.6.12.work2/fs/namespace.c
@@ -202,6 +202,9 @@ struct vfsmount *do_attach_prepare_mnt(s
if(!(child_mnt = clone_mnt(template_mnt,
template_mnt->mnt_root)))
return NULL;
+ spin_lock(&vfsmount_lock);
+ list_del_init(&child_mnt->mnt_fslink);
+ spin_unlock(&vfsmount_lock);
} else
child_mnt = template_mnt;
@@ -355,35 +358,14 @@ struct seq_operations mounts_op = {
*/
int may_umount_tree(struct vfsmount *mnt)
{
- struct list_head *next;
- struct vfsmount *this_parent = mnt;
- int actual_refs;
- int minimum_refs;
+ int actual_refs=0;
+ int minimum_refs=0;
+ struct vfsmount *p;
spin_lock(&vfsmount_lock);
- actual_refs = atomic_read(&mnt->mnt_count);
- minimum_refs = 2;
-repeat:
- next = this_parent->mnt_mounts.next;
-resume:
- while (next != &this_parent->mnt_mounts) {
- struct vfsmount *p = list_entry(next, struct vfsmount, mnt_child);
-
- next = next->next;
-
+ for (p = mnt; p; p = next_mnt(p, mnt)) {
actual_refs += atomic_read(&p->mnt_count);
minimum_refs += 2;
-
- if (!list_empty(&p->mnt_mounts)) {
- this_parent = p;
- goto repeat;
- }
- }
-
- if (this_parent != mnt) {
- next = this_parent->mnt_child.next;
- this_parent = this_parent->mnt_parent;
- goto resume;
}
spin_unlock(&vfsmount_lock);
@@ -395,18 +377,18 @@ resume:
EXPORT_SYMBOL(may_umount_tree);
-int mount_busy(struct vfsmount *mnt)
+int mount_busy(struct vfsmount *mnt, int refcnt)
{
struct vfspnode *parent_pnode;
if (mnt == mnt->mnt_parent || !IS_MNT_SHARED(mnt->mnt_parent))
- return do_refcount_check(mnt, 2);
+ return do_refcount_check(mnt, refcnt);
parent_pnode = mnt->mnt_parent->mnt_pnode;
BUG_ON(!parent_pnode);
return pnode_mount_busy(parent_pnode,
mnt->mnt_mountpoint,
- mnt->mnt_root, mnt);
+ mnt->mnt_root, mnt, refcnt);
}
/**
@@ -424,9 +406,12 @@ int mount_busy(struct vfsmount *mnt)
*/
int may_umount(struct vfsmount *mnt)
{
- if (mount_busy(mnt))
- return -EBUSY;
- return 0;
+ int ret=0;
+ spin_lock(&vfsmount_lock);
+ if (mount_busy(mnt, 2))
+ ret = -EBUSY;
+ spin_unlock(&vfsmount_lock);
+ return ret;
}
EXPORT_SYMBOL(may_umount);
@@ -445,7 +430,26 @@ void do_detach_mount(struct vfsmount *mn
spin_lock(&vfsmount_lock);
}
-void __umount_tree(struct vfsmount *mnt, int propogate)
+void umount_mnt(struct vfsmount *mnt, int propogate)
+{
+ if (propogate && mnt->mnt_parent != mnt &&
+ IS_MNT_SHARED(mnt->mnt_parent)) {
+ struct vfspnode *parent_pnode
+ = mnt->mnt_parent->mnt_pnode;
+ BUG_ON(!parent_pnode);
+ pnode_umount(parent_pnode,
+ mnt->mnt_mountpoint,
+ mnt->mnt_root);
+ } else {
+ if (IS_MNT_SHARED(mnt) || IS_MNT_SLAVE(mnt)) {
+ BUG_ON(!mnt->mnt_pnode);
+ pnode_disassociate_mnt(mnt);
+ }
+ do_detach_mount(mnt);
+ }
+}
+
+static void __umount_tree(struct vfsmount *mnt, int propogate)
{
struct vfsmount *p;
LIST_HEAD(kill);
@@ -459,21 +463,7 @@ void __umount_tree(struct vfsmount *mnt,
mnt = list_entry(kill.next, struct vfsmount, mnt_list);
list_del_init(&mnt->mnt_list);
list_del_init(&mnt->mnt_fslink);
- if (propogate && mnt->mnt_parent != mnt &&
- IS_MNT_SHARED(mnt->mnt_parent)) {
- struct vfspnode *parent_pnode
- = mnt->mnt_parent->mnt_pnode;
- BUG_ON(!parent_pnode);
- pnode_umount(parent_pnode,
- mnt->mnt_mountpoint,
- mnt->mnt_root);
- } else {
- if (IS_MNT_SHARED(mnt) || IS_MNT_SLAVE(mnt)) {
- BUG_ON(!mnt->mnt_pnode);
- pnode_disassociate_mnt(mnt);
- }
- do_detach_mount(mnt);
- }
+ umount_mnt(mnt, propogate);
}
}
@@ -573,7 +563,7 @@ int do_umount(struct vfsmount *mnt, int
spin_lock(&vfsmount_lock);
}
retval = -EBUSY;
- if (flags & MNT_DETACH || !mount_busy(mnt)) {
+ if (flags & MNT_DETACH || !mount_busy(mnt, 2)) {
if (!list_empty(&mnt->mnt_list))
umount_tree(mnt);
retval = 0;
@@ -755,8 +745,11 @@ static void commit_attach_recursive_mnt(
if (slave_flag)
pnode_add_slave_pnode(master_pnode, tmp_pnode);
- else
+ else {
+ spin_lock(&vfspnode_lock);
pnode_merge_pnode(tmp_pnode, master_pnode);
+ spin_unlock(&vfspnode_lock);
+ }
/*
* we don't need the extra reference to
@@ -820,7 +813,6 @@ static void abort_attach_recursive_mnt(s
list_del_init(head);
}
-
/*
* @source_mnt : mount tree to be attached
* @nd : place the mount tree @source_mnt is attached
@@ -1518,8 +1510,9 @@ static int do_move_mount(struct nameidat
detach_recursive_mnt(old_nd.mnt, &parent_nd);
spin_unlock(&vfsmount_lock);
if ((err = attach_recursive_mnt(old_nd.mnt, nd, 1))) {
+ spin_lock(&vfsmount_lock);
undo_detach_recursive_mnt(old_nd.mnt, &parent_nd);
- goto out1;
+ goto out2;
}
spin_lock(&vfsmount_lock);
mntput(old_nd.mnt);
@@ -1621,6 +1614,8 @@ void mark_mounts_for_expiry(struct list_
if (list_empty(mounts))
return;
+ down_write(&namespace_sem);
+
spin_lock(&vfsmount_lock);
/* extract from the expiration list every vfsmount that matches the
@@ -1630,8 +1625,7 @@ void mark_mounts_for_expiry(struct list_
* cleared by mntput())
*/
list_for_each_entry_safe(mnt, next, mounts, mnt_fslink) {
- if (!xchg(&mnt->mnt_expiry_mark, 1) ||
- atomic_read(&mnt->mnt_count) != 1)
+ if (!xchg(&mnt->mnt_expiry_mark, 1) || mount_busy(mnt, 1))
continue;
mntget(mnt);
@@ -1639,12 +1633,13 @@ void mark_mounts_for_expiry(struct list_
}
/*
- * go through the vfsmounts we've just consigned to the graveyard to
- * - check that they're still dead
+ * go through the vfsmounts we've just consigned to the graveyard
* - delete the vfsmount from the appropriate namespace under lock
* - dispose of the corpse
*/
while (!list_empty(&graveyard)) {
+ struct super_block *sb;
+
mnt = list_entry(graveyard.next, struct vfsmount, mnt_fslink);
list_del_init(&mnt->mnt_fslink);
@@ -1655,60 +1650,25 @@ void mark_mounts_for_expiry(struct list_
continue;
get_namespace(namespace);
- spin_unlock(&vfsmount_lock);
- down_write(&namespace_sem);
- spin_lock(&vfsmount_lock);
-
- /* check that it is still dead: the count should now be 2 - as
- * contributed by the vfsmount parent and the mntget above */
- if (atomic_read(&mnt->mnt_count) == 2) {
- struct vfsmount *xdmnt;
- struct dentry *xdentry;
-
- /* delete from the namespace */
- list_del_init(&mnt->mnt_list);
- list_del_init(&mnt->mnt_child);
- list_del_init(&mnt->mnt_hash);
- mnt->mnt_mountpoint->d_mounted--;
-
- xdentry = mnt->mnt_mountpoint;
- mnt->mnt_mountpoint = mnt->mnt_root;
- xdmnt = mnt->mnt_parent;
- mnt->mnt_parent = mnt;
-
- spin_unlock(&vfsmount_lock);
-
- mntput(xdmnt);
- dput(xdentry);
-
- /* now lay it to rest if this was the last ref on the
- * superblock */
- if (atomic_read(&mnt->mnt_sb->s_active) == 1) {
- /* last instance - try to be smart */
- lock_kernel();
- DQUOT_OFF(mnt->mnt_sb);
- acct_auto_close(mnt->mnt_sb);
- unlock_kernel();
- }
-
- mntput(mnt);
- } else {
- /* someone brought it back to life whilst we didn't
- * have any locks held so return it to the expiration
- * list */
- list_add_tail(&mnt->mnt_fslink, mounts);
- spin_unlock(&vfsmount_lock);
+ sb = mnt->mnt_sb;
+ umount_mnt(mnt, 1);
+ /*
+ * now lay it to rest if this was the last ref on the
+ * superblock
+ */
+ if (atomic_read(&sb->s_active) == 1) {
+ /* last instance - try to be smart */
+ lock_kernel();
+ DQUOT_OFF(sb);
+ acct_auto_close(sb);
+ unlock_kernel();
}
-
- up_write(&namespace_sem);
-
mntput(mnt);
- put_namespace(namespace);
- spin_lock(&vfsmount_lock);
+ put_namespace(namespace);
}
-
spin_unlock(&vfsmount_lock);
+ up_write(&namespace_sem);
}
EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);
@@ -2149,24 +2109,24 @@ asmlinkage long sys_pivot_root(const cha
detach_recursive_mnt(new_nd.mnt, &parent_nd);
spin_unlock(&vfsmount_lock);
- if ((error = attach_recursive_mnt(user_nd.mnt, &old_nd, 1))) {
+ if ((error = attach_recursive_mnt(new_nd.mnt, &root_parent, 1))) {
spin_lock(&vfsmount_lock);
undo_detach_recursive_mnt(new_nd.mnt, &parent_nd);
undo_detach_recursive_mnt(user_nd.mnt, &root_parent);
goto out3;
}
spin_lock(&vfsmount_lock);
- mntput(user_nd.mnt);
+ mntput(new_nd.mnt);
spin_unlock(&vfsmount_lock);
- if ((error = attach_recursive_mnt(new_nd.mnt, &root_parent, 1))) {
+ if ((error = attach_recursive_mnt(user_nd.mnt, &old_nd, 1))) {
spin_lock(&vfsmount_lock);
undo_detach_recursive_mnt(new_nd.mnt, &parent_nd);
undo_detach_recursive_mnt(user_nd.mnt, &root_parent);
goto out3;
}
spin_lock(&vfsmount_lock);
- mntput(new_nd.mnt);
+ mntput(user_nd.mnt);
spin_unlock(&vfsmount_lock);
chroot_fs_refs(&user_nd, &new_nd);
Index: 2.6.12.work2/fs/pnode.c
===================================================================
--- 2.6.12.work2.orig/fs/pnode.c
+++ 2.6.12.work2/fs/pnode.c
@@ -29,7 +29,7 @@
static kmem_cache_t * pnode_cachep;
/* spinlock for pnode related operations */
- __cacheline_aligned_in_smp DEFINE_SPINLOCK(vfspnode_lock);
+ __cacheline_aligned_in_smp DEFINE_SPINLOCK(vfspnode_lock);
enum pnode_vfs_type {
PNODE_MEMBER_VFS = 0x01,
@@ -673,6 +673,7 @@ static int vfs_busy(struct vfsmount *mnt
struct dentry *dentry = va_arg(args, struct dentry *);
struct dentry *rootdentry = va_arg(args, struct dentry *);
struct vfsmount *origmnt = va_arg(args, struct vfsmount *);
+ int refcnt = va_arg(args, int);
struct vfsmount *child_mnt;
int ret=0;
@@ -685,22 +686,21 @@ static int vfs_busy(struct vfsmount *mnt
if (list_empty(&child_mnt->mnt_mounts)) {
if (origmnt == child_mnt)
- ret = do_refcount_check(child_mnt, 3);
+ ret = do_refcount_check(child_mnt, refcnt+1);
else
- ret = do_refcount_check(child_mnt, 2);
+ ret = do_refcount_check(child_mnt, refcnt);
}
mntput(child_mnt);
return ret;
}
int pnode_mount_busy(struct vfspnode *pnode, struct dentry *mntpt,
- struct dentry *root, struct vfsmount *mnt)
+ struct dentry *root, struct vfsmount *mnt, int refcnt)
{
return pnode_traverse(pnode, NULL, NULL,
- NULL, NULL, vfs_busy, mntpt, root, mnt);
+ NULL, NULL, vfs_busy, mntpt, root, mnt, refcnt);
}
-
int vfs_umount(struct vfsmount *mnt, enum pnode_vfs_type flag,
void *indata, va_list args)
{
Index: 2.6.12.work2/include/linux/pnode.h
===================================================================
--- 2.6.12.work2.orig/include/linux/pnode.h
+++ 2.6.12.work2/include/linux/pnode.h
@@ -77,6 +77,7 @@ void pnode_add_member_mnt(struct vfspnod
void pnode_del_slave_mnt(struct vfsmount *);
void pnode_del_member_mnt(struct vfsmount *);
void pnode_disassociate_mnt(struct vfsmount *);
+void pnode_member_to_slave(struct vfsmount *);
void pnode_add_slave_pnode(struct vfspnode *, struct vfspnode *);
struct vfsmount * pnode_make_mounted(struct vfspnode *, struct vfsmount *,
struct dentry *);
@@ -91,5 +92,5 @@ int pnode_commit_mount(struct vfspnode *
int pnode_abort_mount(struct vfspnode *, struct vfsmount *);
int pnode_umount(struct vfspnode *, struct dentry *, struct dentry *);
int pnode_mount_busy(struct vfspnode *, struct dentry *, struct dentry *,
- struct vfsmount *);
+ struct vfsmount *, int);
#endif /* _LINUX_PNODE_H */
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: supposed to be shared subtree patches.
2005-07-25 22:44 Ram Pai
` (6 preceding siblings ...)
2005-07-25 22:44 ` Ram Pai
@ 2005-07-26 2:53 ` Ram Pai
7 siblings, 0 replies; 20+ messages in thread
From: Ram Pai @ 2005-07-26 2:53 UTC (permalink / raw)
To: linux-kernel, linux-fsdevel
Cc: akpm, Al Viro, Avantika Mathur, Mike Waychison
On Mon, 2005-07-25 at 15:44, Ram Pai wrote:
> , miklos@szeredi.hu, Janak Desai <janak@us.ibm.com>, linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org
> Subject: [PATCH 0/7] shared subtree
>
> Hi Andrew/Al Viro,
>
> Enclosing a final set of well tested patches that implement
....my apologies. I screwed up sending the patches through quilt.
anyway I have received the following comments from Andrew Morton, which
I will incorporate before sending out saner-looking patches.
sorry again,
RP
Andrew's comments follows:
----------------------------------------
Frankly, I don't even know what these patches _do_, and haven't spent
the time to try to find out.
If these patches are merged, how do we expect end-users to find out how
to use the new capabilities?
A few paragraphs in the patch #1 changelog would help. A high-level
description of the new capability which explains what it does and why it
would be a useful thing for Linux.
And maybe some deeper information in a Documentation/ file.
Right now, there might well be a lot of people who could use these new
features, but they don't even know that these patches provide them!
It's all a bit of a mystery, really.
---------------------------------------------------------------------
^ permalink raw reply [flat|nested] 20+ messages in thread