[PATCH] private mounts

linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH] private mounts
@ 2005-04-24 20:08 Miklos Szeredi
  2005-04-24 20:13 ` Al Viro
  2005-04-24 20:18 ` Christoph Hellwig
  0 siblings, 2 replies; 95+ messages in thread
From: Miklos Szeredi @ 2005-04-24 20:08 UTC (permalink / raw)
  To: linux-fsdevel, hch; +Cc: linux-kernel, akpm

This simple patch adds support for private (or invisible) mounts.  The
rationale is to allow mounts to be private for a user but still in the
global namespace.

An immediate user of this would be FUSE, which currently achieves the
hiding of data with inode->permission(), which is less elegant.

Christoph, I'm especially interested in your opinion, since you were so
strongly opposed to the current solution in FUSE.

Performance measurements indicate that the overhead is about 2% of the
time spent following mounts, or 6ns per-mount on a 533 Celeron.

This patch does:

 - add new mount flag: MS_PRIVATE / MNT_PRIVATE
 - add new member in struct vfsmount: mnt_uid
 - if MNT_PRIVATE is set, set mnt_uid to current->fsuid in
   do_add_mount() and do_remount()
 - in clone_mnt() copy mnt_uid to the new mount
 - in lookup_mnt() while looping through the hash chain for the
   mountpoint, check if the mount is "visible" for this process, and
   skip it if not

Comments are appreciated.  If there are no vetoes against the patch, I
think it's suitable for -mm.

Thanks,
Miklos

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>

diff -rup orig/linux-2.6.11/fs/namespace.c linux-2.6.11/fs/namespace.c
--- orig/linux-2.6.11/fs/namespace.c	2005-03-04 23:18:48.000000000 +0100
+++ linux-2.6.11/fs/namespace.c	2005-04-24 12:44:41.000000000 +0200
@@ -81,6 +81,15 @@ void free_vfsmnt(struct vfsmount *mnt)
 }
 
 /*
+ * Check if this mount should be skipped or not
+ */
+static inline int mnt_visible(struct vfsmount *mnt)
+{
+	return !(mnt->mnt_flags & MNT_PRIVATE) ||
+		mnt->mnt_uid == current->fsuid;
+}
+
+/*
  * Now, lookup_mnt increments the ref count before returning
  * the vfsmount struct.
  */
@@ -97,7 +106,8 @@ struct vfsmount *lookup_mnt(struct vfsmo
 		if (tmp == head)
 			break;
 		p = list_entry(tmp, struct vfsmount, mnt_hash);
-		if (p->mnt_parent == mnt && p->mnt_mountpoint == dentry) {
+		if (p->mnt_parent == mnt && p->mnt_mountpoint == dentry &&
+		    mnt_visible(p)) {
 			found = mntget(p);
 			break;
 		}
@@ -155,6 +165,7 @@ clone_mnt(struct vfsmount *old, struct d
 
 	if (mnt) {
 		mnt->mnt_flags = old->mnt_flags;
+		mnt->mnt_uid = old->mnt_uid;
 		atomic_inc(&sb->s_active);
 		mnt->mnt_sb = sb;
 		mnt->mnt_root = dget(root);
@@ -234,6 +245,7 @@ static int show_vfsmnt(struct seq_file *
 		{ MNT_NOSUID, ",nosuid" },
 		{ MNT_NODEV, ",nodev" },
 		{ MNT_NOEXEC, ",noexec" },
+		{ MNT_PRIVATE, ",private" },
 		{ 0, NULL }
 	};
 	struct proc_fs_info *fs_infop;
@@ -252,6 +264,8 @@ static int show_vfsmnt(struct seq_file *
 		if (mnt->mnt_flags & fs_infop->flag)
 			seq_puts(m, fs_infop->str);
 	}
+	if (mnt->mnt_flags & MNT_PRIVATE)
+		seq_printf(m, ",mnt_uid=%u", mnt->mnt_uid);
 	if (mnt->mnt_sb->s_op->show_options)
 		err = mnt->mnt_sb->s_op->show_options(m, mnt);
 	seq_puts(m, " 0 0\n");
@@ -684,8 +698,11 @@ static int do_remount(struct nameidata *
 
 	down_write(&sb->s_umount);
 	err = do_remount_sb(sb, flags, data, 0);
-	if (!err)
+	if (!err) {
 		nd->mnt->mnt_flags=mnt_flags;
+		if (mnt_flags & MNT_PRIVATE)
+			nd->mnt->mnt_uid = current->fsuid;
+	}
 	up_write(&sb->s_umount);
 	if (!err)
 		security_sb_post_remount(nd->mnt, flags, data);
@@ -807,6 +824,8 @@ int do_add_mount(struct vfsmount *newmnt
 		goto unlock;
 
 	newmnt->mnt_flags = mnt_flags;
+	if (mnt_flags & MNT_PRIVATE)
+		newmnt->mnt_uid = current->fsuid;
 	err = graft_tree(newmnt, nd);
 
 	if (err == 0 && fslist) {
@@ -1033,7 +1052,9 @@ long do_mount(char * dev_name, char * di
 		mnt_flags |= MNT_NODEV;
 	if (flags & MS_NOEXEC)
 		mnt_flags |= MNT_NOEXEC;
-	flags &= ~(MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_ACTIVE);
+	if (flags & MS_PRIVATE)
+		mnt_flags |= MNT_PRIVATE;
+	flags &= ~(MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_PRIVATE|MS_ACTIVE);
 
 	/* ... and get the mountpoint */
 	retval = path_lookup(dir_name, LOOKUP_FOLLOW, &nd);
diff -rup orig/linux-2.6.11/include/linux/fs.h linux-2.6.11/include/linux/fs.h
--- orig/linux-2.6.11/include/linux/fs.h	2005-03-04 23:19:05.000000000 +0100
+++ linux-2.6.11/include/linux/fs.h	2005-04-24 10:23:33.000000000 +0200
@@ -96,6 +96,7 @@ extern int dir_notify_enable;
 #define MS_REMOUNT	32	/* Alter flags of a mounted FS */
 #define MS_MANDLOCK	64	/* Allow mandatory locks on an FS */
 #define MS_DIRSYNC	128	/* Directory modifications are synchronous */
+#define MS_PRIVATE	256	/* Make this mount invisible to other users */
 #define MS_NOATIME	1024	/* Do not update access times. */
 #define MS_NODIRATIME	2048	/* Do not update directory access times */
 #define MS_BIND		4096
diff -rup orig/linux-2.6.11/include/linux/mount.h linux-2.6.11/include/linux/mount.h
--- orig/linux-2.6.11/include/linux/mount.h	2004-12-25 11:52:55.000000000 +0100
+++ linux-2.6.11/include/linux/mount.h	2005-04-24 10:24:29.000000000 +0200
@@ -19,6 +19,7 @@
 #define MNT_NOSUID	1
 #define MNT_NODEV	2
 #define MNT_NOEXEC	4
+#define MNT_PRIVATE	8
 
 struct vfsmount
 {
@@ -31,6 +32,7 @@ struct vfsmount
 	struct list_head mnt_child;	/* and going through their mnt_child */
 	atomic_t mnt_count;
 	int mnt_flags;
+	uid_t mnt_uid;
 	int mnt_expiry_mark;		/* true if marked for expiry */
 	char *mnt_devname;		/* Name of device e.g. /dev/dsk/hda1 */
 	struct list_head mnt_list;

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-24 20:08 [PATCH] private mounts Miklos Szeredi
@ 2005-04-24 20:13 ` Al Viro
  2005-04-24 20:45   ` Miklos Szeredi
  2005-04-24 20:18 ` Christoph Hellwig
  1 sibling, 1 reply; 95+ messages in thread
From: Al Viro @ 2005-04-24 20:13 UTC (permalink / raw)
  To: Miklos Szeredi; +Cc: linux-fsdevel, hch, linux-kernel, akpm

On Sun, Apr 24, 2005 at 10:08:13PM +0200, Miklos Szeredi wrote:
> Comments are appreciated.  If there are no vetoes agains the patch, I
> think it's suitable for -mm.

Vetoed.  Having suid application with different pathname resolution than
that of parent just because it is suid is not acceptable.  I'm sorry,
but breaking hell knows how many existing applications is not an option.

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-24 20:08 [PATCH] private mounts Miklos Szeredi
  2005-04-24 20:13 ` Al Viro
@ 2005-04-24 20:18 ` Christoph Hellwig
  2005-04-24 20:50   ` Miklos Szeredi
  1 sibling, 1 reply; 95+ messages in thread
From: Christoph Hellwig @ 2005-04-24 20:18 UTC (permalink / raw)
  To: Miklos Szeredi; +Cc: linux-fsdevel, hch, linux-kernel, akpm

On Sun, Apr 24, 2005 at 10:08:13PM +0200, Miklos Szeredi wrote:
> This simple patch adds support for private (or invisible) mounts.  The
> rationale is to allow mounts to be private for a user but still in the
> global namespace.

As mentioned in the last -fsdevel thread a few times the idea of per-user
mounts is fundamentally flawed.  Crossing a namespace boundary must be
explicit - using clone or a new unshare() syscall.


^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-24 20:13 ` Al Viro
@ 2005-04-24 20:45   ` Miklos Szeredi
  0 siblings, 0 replies; 95+ messages in thread
From: Miklos Szeredi @ 2005-04-24 20:45 UTC (permalink / raw)
  To: viro; +Cc: linux-fsdevel, hch, linux-kernel, akpm

> > Comments are appreciated.  If there are no vetoes agains the patch, I
> > think it's suitable for -mm.
> 
> Vetoed.  Having suid application with different pathname resolution than
> that of parent just because it is suid is not acceptable.  I'm sorry,
> but breaking hell knows how many existing applications is not an option.

I'm pretty sure any suid program doing path resolution and other
filesystem operations on _behalf_ of the original user will do them
with fsuid, fsgid set to the original.  Otherwise they are bound to
break in other cases too (NFS export with root_squash, etc).

Have any counterexamples?

Thanks,
Miklos

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-24 20:18 ` Christoph Hellwig
@ 2005-04-24 20:50   ` Miklos Szeredi
  2005-04-24 20:54     ` Al Viro
  0 siblings, 1 reply; 95+ messages in thread
From: Miklos Szeredi @ 2005-04-24 20:50 UTC (permalink / raw)
  To: hch; +Cc: linux-fsdevel, hch, linux-kernel, akpm

> > This simple patch adds support for private (or invisible) mounts.  The
> > rationale is to allow mounts to be private for a user but still in the
> > global namespace.
> 
> As mentioned in the last -fsdevel thread a few times the idea of per-user
> mounts is fundamentally flawed.  Crossing a namespace boundary must be
> explicit - using clone or a new unshare() syscall.

Also mentioned in that thread quite a few times is the fact that the
clone() and unshare() model does not solve people's requirements.

Care to read through that thread and suggest an alternative solution?

Thanks,
Miklos

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-24 20:50   ` Miklos Szeredi
@ 2005-04-24 20:54     ` Al Viro
  2005-04-24 20:59       ` Miklos Szeredi
  0 siblings, 1 reply; 95+ messages in thread
From: Al Viro @ 2005-04-24 20:54 UTC (permalink / raw)
  To: Miklos Szeredi; +Cc: hch, linux-fsdevel, linux-kernel, akpm

On Sun, Apr 24, 2005 at 10:50:04PM +0200, Miklos Szeredi wrote:
> Also mentioned in that thread quite a few times is the fact the the
> clone() and unshare() modell does not solve people's requirements.

Could we please get rid of references to requirements without a rationale?
There's quite enough of that from Carrion-Grade Linux crowd, TYVM.

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-24 20:54     ` Al Viro
@ 2005-04-24 20:59       ` Miklos Szeredi
  2005-04-24 21:06         ` Christoph Hellwig
  2005-04-24 21:06         ` Al Viro
  0 siblings, 2 replies; 95+ messages in thread
From: Miklos Szeredi @ 2005-04-24 20:59 UTC (permalink / raw)
  To: viro; +Cc: hch, linux-fsdevel, linux-kernel, akpm

> > Also mentioned in that thread quite a few times is the fact the the
> > clone() and unshare() modell does not solve people's requirements.
> 
> Could we please get of references to requirements without a rationale?
> There's quite enough of that from Carrion-Grade Linux crowd, TYVM.

The rationale has been explained in that thread.  E.g. this quote from
Jamie Lokier in an answer to you:

> I believe the point is:
> 
>    1. Person is logged from client Y to server X, and mounts something on
>       $HOME/mnt/private (that's on X).
> 
>    2. On client Y, person does "scp X:mnt/private/secrets.txt ."
>       and wants it to work.
> 
> The second operation is a separate login to the first.

Solution?

Thanks,
Miklos

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-24 20:59       ` Miklos Szeredi
  2005-04-24 21:06         ` Christoph Hellwig
@ 2005-04-24 21:06         ` Al Viro
  2005-04-24 21:15           ` Miklos Szeredi
  2005-04-24 21:38           ` [PATCH] private mounts Jamie Lokier
  1 sibling, 2 replies; 95+ messages in thread
From: Al Viro @ 2005-04-24 21:06 UTC (permalink / raw)
  To: Miklos Szeredi; +Cc: hch, linux-fsdevel, linux-kernel, akpm

On Sun, Apr 24, 2005 at 10:59:46PM +0200, Miklos Szeredi wrote:
> > I believe the point is:
> > 
> >    1. Person is logged from client Y to server X, and mounts something on
> >       $HOME/mnt/private (that's on X).
> > 
> >    2. On client Y, person does "scp X:mnt/private/secrets.txt ."
> >       and wants it to work.
> > 
> > The second operation is a separate login to the first.
> 
> Solution?

... is the same as for the same question with "set of mounts" replaced
with "environment variables".

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-24 20:59       ` Miklos Szeredi
@ 2005-04-24 21:06         ` Christoph Hellwig
  2005-04-24 21:12           ` Jamie Lokier
  2005-04-24 21:06         ` Al Viro
  1 sibling, 1 reply; 95+ messages in thread
From: Christoph Hellwig @ 2005-04-24 21:06 UTC (permalink / raw)
  To: Miklos Szeredi; +Cc: viro, hch, linux-fsdevel, linux-kernel, akpm

On Sun, Apr 24, 2005 at 10:59:46PM +0200, Miklos Szeredi wrote:
> > Could we please get of references to requirements without a rationale?
> > There's quite enough of that from Carrion-Grade Linux crowd, TYVM.
> 
> The rationale has been explained in that thread.  E.g. this quote from
> Jamie Lokier in an answer to you:

You still haven't written down coherent requirements.

> 
> > I believe the point is:
> > 
> >    1. Person is logged from client Y to server X, and mounts something on
> >       $HOME/mnt/private (that's on X).
> > 
> >    2. On client Y, person does "scp X:mnt/private/secrets.txt ."
> >       and wants it to work.
> > 
> > The second operation is a separate login to the first.
> 
> Solution?

just restart your shell.  Same way you do that after adjusting $PATH.


^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-24 21:06         ` Christoph Hellwig
@ 2005-04-24 21:12           ` Jamie Lokier
  0 siblings, 0 replies; 95+ messages in thread
From: Jamie Lokier @ 2005-04-24 21:12 UTC (permalink / raw)
  To: Christoph Hellwig, Miklos Szeredi, viro, linux-fsdevel,
	linux-kernel, akpm

Christoph Hellwig wrote:
> > > I believe the point is:
> > > 
> > >    1. Person is logged from client Y to server X, and mounts something on
> > >       $HOME/mnt/private (that's on X).
> > > 
> > >    2. On client Y, person does "scp X:mnt/private/secrets.txt ."
> > >       and wants it to work.
> > > 
> > > The second operation is a separate login to the first.
> > 
> > Solution?
> 
> just restart your shell.  Same way you do that after adjusting $PATH.

What do you mean?

I cannot think of any way restarting the shell would solve the above.

-- Jamie

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-24 21:06         ` Al Viro
@ 2005-04-24 21:15           ` Miklos Szeredi
  2005-04-24 21:19             ` Al Viro
  2005-04-24 21:38           ` [PATCH] private mounts Jamie Lokier
  1 sibling, 1 reply; 95+ messages in thread
From: Miklos Szeredi @ 2005-04-24 21:15 UTC (permalink / raw)
  To: viro; +Cc: hch, linux-fsdevel, linux-kernel, akpm

> > > I believe the point is:
> > > 
> > >    1. Person is logged from client Y to server X, and mounts something on
> > >       $HOME/mnt/private (that's on X).
> > > 
> > >    2. On client Y, person does "scp X:mnt/private/secrets.txt ."
> > >       and wants it to work.
> > > 
> > > The second operation is a separate login to the first.
> > 
> > Solution?
> 
> ... is the same as for the same question with "set of mounts" replaced
> with "environment variables".

No.  You can't set "mount environment" in scp.

Otherwise your analogy is nice, but misses a few points.  The usage of
mounts that we are talking about is much more dynamic than usage of
environment variables.  You wouldn't want to set an environment
variable in all your shells just to access a remote system through
sshfs for example.  It _is_ possible (except the ftp, scp case) but
_very_ inconvenient.

I ask again, what solution would you suggest?

Thanks,
Miklos

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-24 21:15           ` Miklos Szeredi
@ 2005-04-24 21:19             ` Al Viro
  2005-04-24 21:29               ` Miklos Szeredi
                                 ` (3 more replies)
  0 siblings, 4 replies; 95+ messages in thread
From: Al Viro @ 2005-04-24 21:19 UTC (permalink / raw)
  To: Miklos Szeredi; +Cc: hch, linux-fsdevel, linux-kernel, akpm

On Sun, Apr 24, 2005 at 11:15:35PM +0200, Miklos Szeredi wrote:
> No.  You can't set "mount environment" in scp.

Of course you can.  It does execute the obvious set of rc files.
 
> Otherwise your analogy is nice, but misses a few points.  The usage of
> mounts that we are talking about is much more dynamic than usage of
> environment variables.

What the hell are you smoking and just how are you using shell?

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-24 21:19             ` Al Viro
@ 2005-04-24 21:29               ` Miklos Szeredi
  2005-04-24 21:39                 ` Jamie Lokier
  2005-04-25  7:10                 ` Jan Hudec
  2005-04-24 21:43               ` Jamie Lokier
                                 ` (2 subsequent siblings)
  3 siblings, 2 replies; 95+ messages in thread
From: Miklos Szeredi @ 2005-04-24 21:29 UTC (permalink / raw)
  To: viro; +Cc: hch, linux-fsdevel, linux-kernel, akpm

> On Sun, Apr 24, 2005 at 11:15:35PM +0200, Miklos Szeredi wrote:
> > No.  You can't set "mount environment" in scp.
> 
> Of course you can.  It does execute the obvious set of rc files.

Don't think so.  ftp server and sftp server sure as hell don't.

> > Otherwise your analogy is nice, but misses a few points.  The usage of
> > mounts that we are talking about is much more dynamic than usage of
> > environment variables.
> 
> What the hell are you smoking and just how are you using shell?

Maybe differently from you :).  It's not that often that I have to
tweak environment variables.  They are usually set by scripts.

However if you write me a script that reads my mind as to which server
I want to mount with sshfs at which time, I give you all my respect.

Miklos


^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-24 21:06         ` Al Viro
  2005-04-24 21:15           ` Miklos Szeredi
@ 2005-04-24 21:38           ` Jamie Lokier
  2005-04-24 22:20             ` Ram
                               ` (2 more replies)
  1 sibling, 3 replies; 95+ messages in thread
From: Jamie Lokier @ 2005-04-24 21:38 UTC (permalink / raw)
  To: Al Viro; +Cc: Miklos Szeredi, hch, linux-fsdevel, linux-kernel, akpm

Al Viro wrote:
> > > I believe the point is:
> > > 
> > >    1. Person is logged from client Y to server X, and mounts something on
> > >       $HOME/mnt/private (that's on X).
> > > 
> > >    2. On client Y, person does "scp X:mnt/private/secrets.txt ."
> > >       and wants it to work.
> > > 
> > > The second operation is a separate login to the first.
> > 
> > Solution?
> 
> ... is the same as for the same question with "set of mounts" replaced
> with "environment variables".

Not quite.

After changing environment variables in .profile, you can copy them to
other shells using ". ~/.profile".

There is no analogous mechanism to copy namespaces.

I agree with you that Miklos' patch is not the right way to do it.

Much better is the proposal to make namespaces first-class objects,
that can be switched to.  Then users can choose to have themselves a
namespace containing their private mounts, if they want it, with
login/libpam or even a program run from .profile switching into it.

While users can be allowed to create their own namespaces which affect
the path traversal of their _own_ directories, it's important that the
existence of such namespaces cannot affect path traversal of other
directories such as /etc, or /autofs/whatever - and that creation of
namespaces by a user cannot prevent the unmounting of a non-user
filesystem either.

The way to do that is shared subtrees, or something along those lines.

Here is one possible implementation:

As far as I can tell, namespaces are equivalent to predicates attached
to every mount - the predicate being "this mount intercepts path
traversal at this point if current namespace == X".

It makes sense, when users can create namespaces for themselves, that
the predicate be changed to "this mount valid if [list of current
namespace and all parent namespaces] contains X".  Parent namespace
means the namespace from which a CLONE_NS namespace inherits.

Then it would be safe (i.e. secure) to allow ordinary users to use
CLONE_NS for the purpose of establishing private namespace(s), within
which they can mount things on directories they own.  But those users
would continue to see mounts & unmounts done by the system in other
directories such as /mnt and /autofs.  Effectively this confines the
new namespace to only affecting directories owned by the user.

That would work properly with suid programs, properly with autofs and
also manual system-wide administration, and it is general enough that
it doesn't force any particular policy.  Also, it would be usable for
partial sharing of resources in virtual server and chroot scenarios.
What's not to like? :)

-- Jamie

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-24 21:29               ` Miklos Szeredi
@ 2005-04-24 21:39                 ` Jamie Lokier
  2005-04-25  7:10                 ` Jan Hudec
  1 sibling, 0 replies; 95+ messages in thread
From: Jamie Lokier @ 2005-04-24 21:39 UTC (permalink / raw)
  To: Miklos Szeredi; +Cc: viro, hch, linux-fsdevel, linux-kernel, akpm

Miklos Szeredi wrote:
> > On Sun, Apr 24, 2005 at 11:15:35PM +0200, Miklos Szeredi wrote:
> > > No.  You can't set "mount environment" in scp.
> > 
> > Of course you can.  It does execute the obvious set of rc files.
> 
> Don't think so.  ftp server and sftp server sure as hell don't.

That's no argument, because you are free to change the ftp and sftp
servers to add this behaviour if you want it.

-- Jamie

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-24 21:19             ` Al Viro
  2005-04-24 21:29               ` Miklos Szeredi
@ 2005-04-24 21:43               ` Jamie Lokier
  2005-04-25  7:14                 ` Jan Hudec
  2005-04-27  9:14                 ` Helge Hafting
  2005-04-25  9:48               ` Olivier Galibert
  2005-04-25 21:09               ` Bryan Henderson
  3 siblings, 2 replies; 95+ messages in thread
From: Jamie Lokier @ 2005-04-24 21:43 UTC (permalink / raw)
  To: Al Viro; +Cc: Miklos Szeredi, hch, linux-fsdevel, linux-kernel, akpm

Al Viro wrote:
> On Sun, Apr 24, 2005 at 11:15:35PM +0200, Miklos Szeredi wrote:
> > No.  You can't set "mount environment" in scp.
> 
> Of course you can.  It does execute the obvious set of rc files.

It doesn't work for the specified use-scenario.  The reason is that
there is no command or system call that can be executed from those rc
files to join an existing namespace.

He wants to do this:

   1. From client, login to server and do a usermount on $HOME/private.

   2. From client, login to server and read the files previously mounted.

-- Jamie

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-24 21:38           ` [PATCH] private mounts Jamie Lokier
@ 2005-04-24 22:20             ` Ram
  2005-04-24 22:22               ` Jamie Lokier
  2005-04-25  6:00             ` Miklos Szeredi
  2005-04-25 15:20             ` Pavel Machek
  2 siblings, 1 reply; 95+ messages in thread
From: Ram @ 2005-04-24 22:20 UTC (permalink / raw)
  To: Jamie Lokier
  Cc: Al Viro, Miklos Szeredi, hch, linux-fsdevel, linux-kernel,
	Andrew Morton

On Sun, 2005-04-24 at 14:38, Jamie Lokier wrote:
> Al Viro wrote:
> > > > I believe the point is:
> > > > 
> > > >    1. Person is logged from client Y to server X, and mounts something on
> > > >       $HOME/mnt/private (that's on X).
> > > > 
> > > >    2. On client Y, person does "scp X:mnt/private/secrets.txt ."
> > > >       and wants it to work.
> > > > 
> > > > The second operation is a separate login to the first.
> > > 
> > > Solution?
> > 
> > ... is the same as for the same question with "set of mounts" replaced
> > with "environment variables".
> 
> Not quite.
> 
> After changing environment variables in .profile, you can copy them to
> other shells using ". ~/.profile".
> 
> There is no analogous mechanism to copy namespaces.
> 
> I agree with you that Miklos' patch is not the right way to do it.
> 
> Much better is the proposal to make namespaces first-class objects,
> that can be switched to.  Then users can choose to have themselves a
> namespace containing their private mounts, if they want it, with
> login/libpam or even a program run from .profile switching into it.
> 
> While users can be allowed to create their own namespaces which affect
> the path traversal of their _own_ directories, it's important that the
> existence of such namespaces cannot affect path traversal of other
> directories such as /etc, or /autofs/whatever - and that creation of
> namespaces by a user cannot prevent the unmounting of a non-user
> filesystem either.
> 
> The way to do that is shared subtrees, or something along those lines.

Right. Adding to it. To begin with the system namespace has all its
entire tree shared. So when a new namespace is cloned, the new namespace
can see any new mount/unmount/binds done in the system namespace as
well. (System namespace is the first initial namespace created by
default).

Any private mounts done by the user in his private-namespace 
will first make that part of the tree private first and then will
continue with the mount. Otherwise the private mount will end up showing
in the system namespace(since it is shared).

RP
> 
> Here is one possible implementation:
> 
> As far as I can tell, namespaces are equivalent to predicates attached
> to every mount - the predicate being "this mount intercepts path
> traversal at this point if current namespace == X".
> 
> It makes sense, when users can create namespaces for themselves, that
> the predicate be changed to "this mount valid if [list of current
> namespace and all parent namespaces] contains X".  Parent namespace
> means the namespace from which a CLONE_NS namespace inherits.
> 
> Then it would be safe (i.e. secure) to allow ordinary users to use
> CLONE_NS for the purpose of establishing private namespace(s), within
> which they can mount things on directories they own.  But those users
> would continue to see mounts & unmounts done by the system in other
> directories such as /mnt and /autofs.  Effectively this confines the
> new namespace to only affecting directories owned by the user.
> 
> That would work properly with suid programs, properly with autofs and
> also manual system-wide administration, and it is general enough that
> it doesn't force any particular policy.  Also, it would be usable for
> partial sharing of resources in virtual server and chroot scenarios.
> What's not to like? :)


> 
> -- Jamie
> -
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-24 22:20             ` Ram
@ 2005-04-24 22:22               ` Jamie Lokier
  0 siblings, 0 replies; 95+ messages in thread
From: Jamie Lokier @ 2005-04-24 22:22 UTC (permalink / raw)
  To: Ram
  Cc: Al Viro, Miklos Szeredi, hch, linux-fsdevel, linux-kernel,
	Andrew Morton

Ram wrote:
> > Much better is the proposal to make namespaces first-class objects,
> > that can be switched to.  Then users can choose to have themselves a
> > namespace containing their private mounts, if they want it, with
> > login/libpam or even a program run from .profile switching into it.
> > 
> > While users can be allowed to create their own namespaces which affect
> > the path traversal of their _own_ directories, it's important that the
> > existence of such namespaces cannot affect path traversal of other
> > directories such as /etc, or /autofs/whatever - and that creation of
> > namespaces by a user cannot prevent the unmounting of a non-user
> > filesystem either.
> > 
> > The way to do that is shared subtrees, or something along those lines.
> 
> Right. Adding to it. To begin with the system namespace has all its
> entire tree shared. So when a new namespace is cloned, the new namespace
> can see any new mount/unmount/binds done in the system namespace as
> well. (System namespace is the first initial namespace created by
> default).
> 
> Any private mounts done by the user in his private-namespace 
> will first make that part of the tree private first and then will
> continue with the mount. Otherwise the private mount will end up showing
> in the system namespace(since it is shared).

Yes, exactly that.

-- Jamie

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-24 21:38           ` [PATCH] private mounts Jamie Lokier
  2005-04-24 22:20             ` Ram
@ 2005-04-25  6:00             ` Miklos Szeredi
  2005-04-25  6:41               ` Ram
  2005-04-25  7:22               ` Jan Hudec
  2005-04-25 15:20             ` Pavel Machek
  2 siblings, 2 replies; 95+ messages in thread
From: Miklos Szeredi @ 2005-04-25  6:00 UTC (permalink / raw)
  To: jamie; +Cc: viro, hch, linux-fsdevel, linux-kernel, akpm

> > 
> > ... is the same as for the same question with "set of mounts" replaced
> > with "environment variables".
> 
> Not quite.
> 
> After changing environment variables in .profile, you can copy them to
> other shells using ". ~/.profile".
> 
> There is no analogous mechanism to copy namespaces.
> 
> I agree with you that Miklos' patch is not the right way to do it.

I'm not sure that it is either.  But, see below...

> Much better is the proposal to make namespaces first-class objects,
> that can be switched to.  Then users can choose to have themselves a
> namespace containing their private mounts, if they want it, with
> login/libpam or even a program run from .profile switching into it.

It would be good if it could be done just in libpam.  But that would
require every libpam user to call into it after the fork() or
whatever, so unshare() and join_namespace() don't mess up the server
running environment.

If not, then it would mean modifying numerous programs, having these
modifications integrated, then having distributions pick up the
changes, etc.  I would imagine quite a long cycle for this to be
actually useful.

> While users can be allowed to create their own namespaces which affect
> the path traversal of their _own_ directories, it's important that the
> existence of such namespaces cannot affect path traversal of other
> directories such as /etc, or /autofs/whatever - and that creation of
> namespaces by a user cannot prevent the unmounting of a non-user
> filesystem either.
> 
> The way to do that is shared subtrees, or something along those lines.

Yes, but we would be achieving essentially the same as my patch, just
with more complexity.  And my patch achieves what FUSE does in 2 lines
of code, namely hide the mount from other users by returning -EACCES
in case fsuid does not match the mount owner.

I agree that your solution is more flexible, but it's also hugely
more complex.  If somebody wants to implement it, fine.  But don't
expect me to do it, unless some company hires me for fs development
(hint, hint ;)

Thanks,
Miklos

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-25  6:00             ` Miklos Szeredi
@ 2005-04-25  6:41               ` Ram
  2005-04-25  9:55                 ` Miklos Szeredi
  2005-04-25  7:22               ` Jan Hudec
  1 sibling, 1 reply; 95+ messages in thread
From: Ram @ 2005-04-25  6:41 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: jamie, viro, hch, linux-fsdevel, linux-kernel, Andrew Morton

On Sun, 2005-04-24 at 23:00, Miklos Szeredi wrote:
> > > 
> > > ... is the same as for the same question with "set of mounts" replaced
> > > with "environment variables".
> > 
> > Not quite.
> > 
> > After changing environment variables in .profile, you can copy them to
> > other shells using ". ~/.profile".
> > 
> > There is no analogous mechanism to copy namespaces.
> > 
> > I agree with you that Miklos' patch is not the right way to do it.
> 
> I'm not sure that it is either.  But, see bellow...
> 
> > Much better is the proposal to make namespaces first-class objects,
> > that can be switched to.  Then users can choose to have themselves a
> > namespace containing their private mounts, if they want it, with
> > login/libpam or even a program run from .profile switching into it.
> 
> It would be good if it could be done just in libpam.  But that would
> require every libpam user to call into it after the fork() or
> whatever, so unshare() and join_namespace() don't mess up the server
> running environment.
> 
> If not, then it would mean modifying numerous programs, having these
> modifications integrated, then having distributions pick up the
> changes, etc.  I would imagine quite a long cycle for this to be
> acutally useful.
> 
> > While users can be allowed to create their own namespaces which affect
> > the path traversal of their _own_ directories, it's important that the
> > existence of such namespaces cannot affect path traversal of other
> > directories such as /etc, or /autofs/whatever - and that creation of
> > namespaces by a user cannot prevent the unmounting of a non-user
> > filesystem either.
> > 
> > The way to do that is shared subtrees, or something along those lines.
> 
> Yes, but we would be achieving essentially the same as my patch, just
> with more complexity.  And my patch achieves what FUSE does in 2 lines
> of code, namely hide the mount from other users by returning -EACCESS
> in case fsuid does not mach the mount owner.
> 

I am not yet sure how invisible mounts can be used to solve the FUSE
problem.

Again my understanding of the basic requirement of FUSE is:

1. A user being able to setup his own VFS-mount environment which
  	 is only visible to the user. 
2. The same user being able to see exactly the same VFS-mount  
	environment from any login session.

RP

> I aggree that your solution is more flexible, but it's also hugely
> more complex.  If somebody want's to implement it, fine.  But don't
> expect me to do it, unless some company hires my for fs development
> (hint, hint ;) 



> 
> Thanks,
> Miklos
> -
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-24 21:29               ` Miklos Szeredi
  2005-04-24 21:39                 ` Jamie Lokier
@ 2005-04-25  7:10                 ` Jan Hudec
  2005-04-25  9:58                   ` Miklos Szeredi
  1 sibling, 1 reply; 95+ messages in thread
From: Jan Hudec @ 2005-04-25  7:10 UTC (permalink / raw)
  To: Miklos Szeredi; +Cc: viro, hch, linux-fsdevel, linux-kernel, akpm

[-- Attachment #1: Type: text/plain, Size: 1317 bytes --]

On Sun, Apr 24, 2005 at 23:29:22 +0200, Miklos Szeredi wrote:
> > On Sun, Apr 24, 2005 at 11:15:35PM +0200, Miklos Szeredi wrote:
> > > No.  You can't set "mount environment" in scp.
> > 
> > Of course you can.  It does execute the obvious set of rc files.
> 
> Don't think so.  ftp server and sftp server sure as hell don't.

Sftp sure *DOES*. It is invoked by shell, which is not run as login one,
but even non-login shell sources an rc file.

> > > Otherwise your analogy is nice, but misses a few points.  The usage of
> > > mounts that we are talking about is much more dynamic than usage of
> > > environment variables.
> > 
> > What the hell are you smoking and just how are you using shell?
> 
> Maybe differently from you :).  It's not that often that I have to
> tweak environment variables.  They are usually set by scripts.
> 
> However if you write me a script that reads my mind as to which server
> I want to mount with sshfs at which time, I give you all my respect.

I can't write a script that reads your mind. But I sure can write
a script that finds out what you mounted in the other shells (with help
of a little wrapper around the mount command).

-------------------------------------------------------------------------------
						 Jan 'Bulb' Hudec <bulb@ucw.cz>

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-24 21:43               ` Jamie Lokier
@ 2005-04-25  7:14                 ` Jan Hudec
  2005-04-27  9:14                 ` Helge Hafting
  1 sibling, 0 replies; 95+ messages in thread
From: Jan Hudec @ 2005-04-25  7:14 UTC (permalink / raw)
  To: Jamie Lokier
  Cc: Al Viro, Miklos Szeredi, hch, linux-fsdevel, linux-kernel, akpm

[-- Attachment #1: Type: text/plain, Size: 947 bytes --]

On Sun, Apr 24, 2005 at 22:43:39 +0100, Jamie Lokier wrote:
> Al Viro wrote:
> > On Sun, Apr 24, 2005 at 11:15:35PM +0200, Miklos Szeredi wrote:
> > > No.  You can't set "mount environment" in scp.
> > 
> > Of course you can.  It does execute the obvious set of rc files.
> 
> It doesn't work for the specified use-scenario.  The reason is that
> there is no command or system call that can be executed from those rc
> files to join an existing namespace.
> 
> He wants to do this:
> 
>    1. From client, login to server and do a usermount on $HOME/private.
> 
>    2. From client, login to server and read the files previously mounted.

Ok, that almost can be done. All that is needed from kernel is an
ability to mount bind from open directory handle instead of a path! The
rest is doable in userland.

-------------------------------------------------------------------------------
						 Jan 'Bulb' Hudec <bulb@ucw.cz>

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-25  6:00             ` Miklos Szeredi
  2005-04-25  6:41               ` Ram
@ 2005-04-25  7:22               ` Jan Hudec
  2005-04-25 10:08                 ` Miklos Szeredi
  1 sibling, 1 reply; 95+ messages in thread
From: Jan Hudec @ 2005-04-25  7:22 UTC (permalink / raw)
  To: Miklos Szeredi; +Cc: jamie, viro, hch, linux-fsdevel, linux-kernel, akpm

[-- Attachment #1: Type: text/plain, Size: 1140 bytes --]

On Mon, Apr 25, 2005 at 08:00:20 +0200, Miklos Szeredi wrote:
> > Much better is the proposal to make namespaces first-class objects,
> > that can be switched to.  Then users can choose to have themselves a
> > namespace containing their private mounts, if they want it, with
> > login/libpam or even a program run from .profile switching into it.
> 
> It would be good if it could be done just in libpam.  But that would
> require every libpam user to call into it after the fork() or
> whatever, so unshare() and join_namespace() don't mess up the server
> running environment.

They do. They *HAVE* to! The 'session' stage modifies the environment,
so it must be done after the fork. So if it, in addition to environment,
modifies namespace, it won't make a difference.

> If not, then it would mean modifying numerous programs, having these
> modifications integrated, then having distributions pick up the
> changes, etc.  I would imagine quite a long cycle for this to be
> acutally useful.

-------------------------------------------------------------------------------
						 Jan 'Bulb' Hudec <bulb@ucw.cz>

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-24 21:19             ` Al Viro
  2005-04-24 21:29               ` Miklos Szeredi
  2005-04-24 21:43               ` Jamie Lokier
@ 2005-04-25  9:48               ` Olivier Galibert
  2005-04-25 16:37                 ` Tim Hockin
  2005-04-30  8:37                 ` Christoph Hellwig
  2005-04-25 21:09               ` Bryan Henderson
  3 siblings, 2 replies; 95+ messages in thread
From: Olivier Galibert @ 2005-04-25  9:48 UTC (permalink / raw)
  To: linux-fsdevel, linux-kernel

On Sun, Apr 24, 2005 at 10:19:42PM +0100, Al Viro wrote:
> Of course you can.  It does execute the obvious set of rc files.

Is there a possibility for a process to change its namespace to
another existing one?  That would be needed to have a per-user
namespace you go to from rc files or pam.

I'd kinda wonder what happens to pwd.

  OG.


^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-25  6:41               ` Ram
@ 2005-04-25  9:55                 ` Miklos Szeredi
  0 siblings, 0 replies; 95+ messages in thread
From: Miklos Szeredi @ 2005-04-25  9:55 UTC (permalink / raw)
  To: linuxram; +Cc: jamie, viro, hch, linux-fsdevel, linux-kernel, akpm

> I have not yet sure how invisible mount can be used to solve the FUSE
> problem.  
> 
> Again my understanding of the basic requirement of FUSE is:
> 
> 1. A user being able to setup his own VFS-mount environment which
>   	 is only visible to the user. 
> 2. The same user being able to see exactly the same VFS-mount  
> 	environment from any login session.

More generally: 

1. the files exported by the FUSE filesystem should not be accessible
   by other users.

2. The user should see exactly the same files from any login session.

These can be satisfied in various ways.  Permission checking, or by
making FUSE mounts invisible to other users, or with private
namespaces (in increasing complexity).

Thanks,
Miklos

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-25  7:10                 ` Jan Hudec
@ 2005-04-25  9:58                   ` Miklos Szeredi
  2005-04-25 11:45                     ` Jan Hudec
  2005-04-30  8:35                     ` Christoph Hellwig
  0 siblings, 2 replies; 95+ messages in thread
From: Miklos Szeredi @ 2005-04-25  9:58 UTC (permalink / raw)
  To: bulb; +Cc: viro, hch, linux-fsdevel, linux-kernel, akpm

> > Don't think so.  ftp server and sftp server sure as hell don't.
> 
> Sftp sure *DOES*. It is invoked by shell, which is not run as login one,
> but even non-login shell sources an rc file.

You win :)

> > However if you write me a script that reads my mind as to which server
> > I want to mount with sshfs at which time, I give you all my respect.
> 
> I can't write a script that reads your mind. But I sure can write
> a script that finds out what you mounted in the other shells (with help
> of a little wrapper around the mount command).

How do you bind mount it from a different namespace?  You _do_ need
bind mount, since a new mount might require password input, etc...

Thanks,
Miklos

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-25  7:22               ` Jan Hudec
@ 2005-04-25 10:08                 ` Miklos Szeredi
  0 siblings, 0 replies; 95+ messages in thread
From: Miklos Szeredi @ 2005-04-25 10:08 UTC (permalink / raw)
  To: bulb; +Cc: jamie, viro, hch, linux-fsdevel, linux-kernel, akpm

> They do. The *HAVE* to do! The 'session' stage modifies the environment,
> so it must be done after the fork. So if it, in addition to environment,
> modifies namespace, it won't make a difference.

That is good news.

So in theory it's doable.  Anyone willing to help putting it all
together?

Thanks,
Miklos

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-25  9:58                   ` Miklos Szeredi
@ 2005-04-25 11:45                     ` Jan Hudec
  2005-04-30  8:35                     ` Christoph Hellwig
  1 sibling, 0 replies; 95+ messages in thread
From: Jan Hudec @ 2005-04-25 11:45 UTC (permalink / raw)
  To: Miklos Szeredi; +Cc: viro, hch, linux-fsdevel, linux-kernel, akpm

[-- Attachment #1: Type: text/plain, Size: 916 bytes --]

On Mon, Apr 25, 2005 at 11:58:50 +0200, Miklos Szeredi wrote:
> > > However if you write me a script that reads my mind as to which server
> > > I want to mount with sshfs at which time, I give you all my respect.
> > 
> > I can't write a script that reads your mind. But I sure can write
> > a script that finds out what you mounted in the other shells (with help
> > of a little wrapper around the mount command).
> 
> How do you bind mount it from a different namespace?  You _do_ need
> bind mount, since a new mount might require password input, etc...

Yes, I would need one thing from kernel. That one thing would be to
mount bind a directory handle, instead of path.

And if you wonder how I get the handle, that's what SCM_RIGHTS message
of unix-domain sockets is for.

-------------------------------------------------------------------------------
						 Jan 'Bulb' Hudec <bulb@ucw.cz>

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-24 21:38           ` [PATCH] private mounts Jamie Lokier
  2005-04-24 22:20             ` Ram
  2005-04-25  6:00             ` Miklos Szeredi
@ 2005-04-25 15:20             ` Pavel Machek
  2005-04-25 19:07               ` Jamie Lokier
  2 siblings, 1 reply; 95+ messages in thread
From: Pavel Machek @ 2005-04-25 15:20 UTC (permalink / raw)
  To: Jamie Lokier
  Cc: Al Viro, Miklos Szeredi, hch, linux-fsdevel, linux-kernel, akpm

Hi!

> > > > I believe the point is:
> > > > 
> > > >    1. Person is logged from client Y to server X, and mounts something on
> > > >       $HOME/mnt/private (that's on X).
> > > > 
> > > >    2. On client Y, person does "scp X:mnt/private/secrets.txt ."
> > > >       and wants it to work.
> > > > 
> > > > The second operation is a separate login to the first.
> > > 
> > > Solution?
> > 
> > ... is the same as for the same question with "set of mounts" replaced
> > with "environment variables".
> 
> Not quite.
> 
> After changing environment variables in .profile, you can copy them to
> other shells using ". ~/.profile".
> 
> There is no analogous mechanism to copy namespaces.

Actually, after you add the right "mount xyzzy /foo" lines to .profile,
you can just . ~/.profile ;-).
								Pavel

-- 
Boycott Kodak -- for their patent abuse against Java.

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-25  9:48               ` Olivier Galibert
@ 2005-04-25 16:37                 ` Tim Hockin
  2005-04-30  8:37                 ` Christoph Hellwig
  1 sibling, 0 replies; 95+ messages in thread
From: Tim Hockin @ 2005-04-25 16:37 UTC (permalink / raw)
  To: Olivier Galibert, linux-fsdevel, linux-kernel

On Mon, Apr 25, 2005 at 11:48:04AM +0200, Olivier Galibert wrote:
> Is there a possibility for a process to change its namespace to
> another existing one?  That would be needed to have a per-user
> namespace you go to from rc files or pam.

I haven't looked at this in about a year, but as of a year ago, no.
Namespaces are/were second-class objects that exist only as referenced by
tasks.  I played with implementing a newns PAM module.  It worked, but was
full of holes.  I started writing a paper on it, but never got around to
finishing it, for various reasons.

Tim

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-25 15:20             ` Pavel Machek
@ 2005-04-25 19:07               ` Jamie Lokier
  2005-04-26  9:29                 ` Pavel Machek
  2005-04-30  8:33                 ` [PATCH] private mounts Christoph Hellwig
  0 siblings, 2 replies; 95+ messages in thread
From: Jamie Lokier @ 2005-04-25 19:07 UTC (permalink / raw)
  To: Pavel Machek
  Cc: Al Viro, Miklos Szeredi, hch, linux-fsdevel, linux-kernel, akpm

Pavel Machek wrote:
> > > ... is the same as for the same question with "set of mounts" replaced
> > > with "environment variables".
> > 
> > Not quite.
> > 
> > After changing environment variables in .profile, you can copy them to
> > other shells using ". ~/.profile".
> > 
> > There is no analogous mechanism to copy namespaces.
> 
> Actually, after you add right mount xyzzy /foo lines into .profile,
> you can just . ~/.profile ;-).

Is there a mount command that can do that?  We're talking about
private mounts - invisible to other namespaces, which includes the
other shells.

If there was a /proc/NNN/namespace, that would do the trick :)

-- Jamie

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-24 21:19             ` Al Viro
                                 ` (2 preceding siblings ...)
  2005-04-25  9:48               ` Olivier Galibert
@ 2005-04-25 21:09               ` Bryan Henderson
  2005-04-26 13:46                 ` filesystem transactions API Ville Herva
  3 siblings, 1 reply; 95+ messages in thread
From: Bryan Henderson @ 2005-04-25 21:09 UTC (permalink / raw)
  To: Al Viro
  Cc: akpm, hch, linux-fsdevel, linux-fsdevel-owner, linux-kernel,
	Miklos Szeredi

>> No.  You can't set "mount environment" in scp.
>
>Of course you can.  It does execute the obvious set of rc files.

Incidentally, there is no obvious set of files.  The only relevant one 
that gets executed does so by accident because of a side effect of an ugly 
hack.

Jamie pointed out that such files wouldn't really help anyway, because 
there isn't a shell command that can affect the mounts seen by the copy 
server process it forks.  And others have noted that some such remote 
processes don't run shells at all.  But in case anyone is thinking of 
shell rc files as an architectural solution to the scp problem, let me 
explain shell rc files, in particular Bash's:

.profile runs when a login shell starts, which is supposed to be when you 
start a work session with the computer.  You put stuff in there like an 
announcement of mail, displaying reminders, reading news, etc.

/etc/profile is the same, but for everyone.

.bashrc runs when an interactive shell starts that isn't a login shell, 
which is supposed to be as in opening  a new shell window.  You put stuff 
in there to customize your interactive experience -- key binding, screen 
colors, aliases, and the like.

Some builds of Bash have a system level version of this as 
/etc/bash.bashrc.

All of these are for shells that are being used by a human.  They can 
really mess up a "user" that is a machine.  The most important case of a 
non-human user is a shell script.

The rc file named by the BASH_ENV environment variable runs for every 
shell, interactive or not.  But this is hard to use for personalization 
because you need a place to personalize BASH_ENV.  It's also hard to use 
for anything else, because so many programs (including some Ssh daemons) 
cut off environment variable inheritance.

Now for the ugly hack:  An interactive shell is normally one whose 
Standard Input is a terminal.  But when rsh came about, Standard Input was 
a socket, even though the shell session was quite interactive.  So Bash 
contains code that looks at several conditions consistent with an rsh 
session and if it determines that it is probably being run as the backend 
of an rsh session, it treats the shell as interactive.  Openssh 'ssh' 
doesn't need this hack, because Sshd uses a pseudo-terminal instead of a 
socket as the shell's Standard Input.  But Openssh's 'scp' falls into the 
trap and gets taken as an interactive human user of the shell.  So .bashrc 
runs.  Many are the scp sessions I've tortured with my .bashrc, and spent 
hours debugging.  (I finally removed the hack from Bash and regained 
sanity).

A design for user-specific namespaces that relies on this particular hack 
would not be clean.

On the other hand, it is possible to customize any scp backend session 
just by making a personal wrapper for the scp backend program.  The 
wrapper can do the setup -- either directly or by running an "scprc" file. 
 With Openssh, you can choose the backend program in various places.

--
Bryan Henderson                          IBM Almaden Research Center
San Jose CA                              Filesystems

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-25 19:07               ` Jamie Lokier
@ 2005-04-26  9:29                 ` Pavel Machek
  2005-04-26 14:07                   ` Jamie Lokier
  2005-04-30  8:33                 ` [PATCH] private mounts Christoph Hellwig
  1 sibling, 1 reply; 95+ messages in thread
From: Pavel Machek @ 2005-04-26  9:29 UTC (permalink / raw)
  To: Jamie Lokier
  Cc: Al Viro, Miklos Szeredi, hch, linux-fsdevel, linux-kernel, akpm

Hi!

> > > > ... is the same as for the same question with "set of mounts" replaced
> > > > with "environment variables".
> > > 
> > > Not quite.
> > > 
> > > After changing environment variables in .profile, you can copy them to
> > > other shells using ". ~/.profile".
> > > 
> > > There is no analogous mechanism to copy namespaces.
> > 
> > Actually, after you add right mount xyzzy /foo lines into .profile,
> > you can just . ~/.profile ;-).
> 
> Is there a mount command that can do that?  We're talking about
> private mounts - invisible to other namespaces, which includes the
> other shells.
> 
> If there was a /proc/NNN/namespace, that would do the trick :)

Sounds like the solution, then. I do not think Al Viro is going to
kill you for /proc/NNN/namespace...
								Pavel
-- 
Boycott Kodak -- for their patent abuse against Java.

^ permalink raw reply	[flat|nested] 95+ messages in thread

* filesystem transactions API
  2005-04-25 21:09               ` Bryan Henderson
@ 2005-04-26 13:46                 ` Ville Herva
  2005-04-26 14:14                   ` Jamie Lokier
  2005-04-26 14:25                   ` Trond Myklebust
  0 siblings, 2 replies; 95+ messages in thread
From: Ville Herva @ 2005-04-26 13:46 UTC (permalink / raw)
  To: linux-fsdevel, linux-kernel

Apparently, Windows Longhorn will include something called "transactional
NTFS". It's explained pretty well in

   http://blogs.msdn.com/because_we_can/

Basically, a process can create a fs transaction, and all fs changes made
between start of the transaction and commit are atomic - meaning nothing
is visible until commit, and if commit fails, everything is rolled back.

Sounds useful... Although there are no service pack installs that could fail
in Linux, the same thing could be useful in rpm, yum, almost anything. 

What do you think?

-- v -- 

v@iki.fi

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-26  9:29                 ` Pavel Machek
@ 2005-04-26 14:07                   ` Jamie Lokier
  2005-04-28 13:28                     ` Eric Van Hensbergen
  2005-04-28 13:47                     ` Eric Van Hensbergen
  0 siblings, 2 replies; 95+ messages in thread
From: Jamie Lokier @ 2005-04-26 14:07 UTC (permalink / raw)
  To: Pavel Machek
  Cc: Al Viro, Miklos Szeredi, hch, linux-fsdevel, linux-kernel, akpm

Pavel Machek wrote:
> > > Actually, after you add right mount xyzzy /foo lines into .profile,
> > > you can just . ~/.profile ;-).
> > 
> > Is there a mount command that can do that?  We're talking about
> > private mounts - invisible to other namespaces, which includes the
> > other shells.
> > 
> > If there was a /proc/NNN/namespace, that would do the trick :)
> 
> Sounds like the solution, then. I do not think Al Viro is going to
> kill you for /proc/NNN/namespace...

Looking closer, I think we already have it.

It's called /proc/NNN/root.

Does chroot into /proc/NNN/root cause the chroot'ing process to adopt
the namespace of NNN?  Looking at the code, I think it does.

Furthermore, I think a daemon can acquire file descriptors for
multiple namespaces already, by open("/") and passing descriptors
between processes.  And the chroot can be done using /proc/self/fd/N
after receiving a descriptor.

This is because file descriptors, and current->fs->pwd and
current->fs->root, record the vfsmnt as well as the dentry that they
opened.

So no new system calls are needed.  A daemon to hand out per-user
namespaces (or any other policy) can be written using existing
kernels, and those namespaces can be joined using chroot.

That's the theory anyway.  It's always possible I misread the code (as
I don't use namespaces and don't have tools handy to try them).

-- Jamie

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: filesystem transactions API
  2005-04-26 13:46                 ` filesystem transactions API Ville Herva
@ 2005-04-26 14:14                   ` Jamie Lokier
  2005-04-26 14:22                     ` Artem B. Bityuckiy
  2005-04-26 14:25                   ` Trond Myklebust
  1 sibling, 1 reply; 95+ messages in thread
From: Jamie Lokier @ 2005-04-26 14:14 UTC (permalink / raw)
  To: Ville Herva; +Cc: linux-fsdevel, linux-kernel

Ville Herva wrote:
> Apparently, Windows Longhorn will include something called "transactional
> NTFS". It's explained pretty well in
> 
>    http://blogs.msdn.com/because_we_can/
> 
> Basically, a process can create a fs transaction, and all fs changes made
> between start of the transaction and commit are atomical - meaning nothing
> is visible until commit, and if commit fails, everything is rolled back.
> 
> Sound useful... Although there are no service pack installs that could fail
> in Linux, the same thing could be useful in rpm, yum, almost anything. 
> 
> What do you think?

I think I've wanted something like that for _years_ in unix.

It's an old, old idea, and I've often wondered why we haven't implemented it.

-- Jamie

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: filesystem transactions API
  2005-04-26 14:14                   ` Jamie Lokier
@ 2005-04-26 14:22                     ` Artem B. Bityuckiy
  2005-04-26 14:32                       ` Jamie Lokier
                                         ` (2 more replies)
  0 siblings, 3 replies; 95+ messages in thread
From: Artem B. Bityuckiy @ 2005-04-26 14:22 UTC (permalink / raw)
  To: Jamie Lokier; +Cc: Ville Herva, linux-fsdevel, linux-kernel

Jamie Lokier wrote:
> I think I've wanted something like that for _years_ in unix.
> 
> It's an old, old idea, and I've often wondered why we haven't implemented it.
> 

I thought it was possible to implement this rather easily on top
of non-transactional FS (albeit I didn't try) and there is no need
to overcomplicate an FS. Just implement a specialized user-space
library and utilize it.


-- 
Best regards, Artem B. Bityuckiy
Oktet Labs (St. Petersburg), Software Engineer.
+78124286709 (office) +79112449030 (mobile)
E-mail: dedekind@oktetlabs.ru, web: http://www.oktetlabs.ru

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: filesystem transactions API
  2005-04-26 13:46                 ` filesystem transactions API Ville Herva
  2005-04-26 14:14                   ` Jamie Lokier
@ 2005-04-26 14:25                   ` Trond Myklebust
  1 sibling, 0 replies; 95+ messages in thread
From: Trond Myklebust @ 2005-04-26 14:25 UTC (permalink / raw)
  To: Ville Herva; +Cc: Linux Filesystem Development, linux-kernel

ty den 26.04.2005 Klokka 16:46 (+0300) skreiv Ville Herva:
> Apparently, Windows Longhorn will include something called "transactional
> NTFS". It's explained pretty well in
> 
>    http://blogs.msdn.com/because_we_can/
> 
> Basically, a process can create a fs transaction, and all fs changes made
> between start of the transaction and commit are atomical - meaning nothing
> is visible until commit, and if commit fails, everything is rolled back.
> 
> Sound useful... Although there are no service pack installs that could fail
> in Linux, the same thing could be useful in rpm, yum, almost anything. 
> 
> What do you think?

NetApp have implemented something similar in their DAFS filesystem
called "rollback locks" (or autorecover locks).

   http://www.watersprings.org/pub/id/draft-wittle-dafs-00.txt

Very useful for database apps etc.

Cheers,
  Trond
-- 
Trond Myklebust <trond.myklebust@fys.uio.no>


^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: filesystem transactions API
  2005-04-26 14:22                     ` Artem B. Bityuckiy
@ 2005-04-26 14:32                       ` Jamie Lokier
  2005-04-26 14:46                         ` Artem B. Bityuckiy
  2005-04-26 15:01                         ` John Stoffel
  2005-04-26 15:40                       ` Charles P. Wright
  2005-04-27 13:36                       ` Andi Kleen
  2 siblings, 2 replies; 95+ messages in thread
From: Jamie Lokier @ 2005-04-26 14:32 UTC (permalink / raw)
  To: Artem B. Bityuckiy; +Cc: Ville Herva, linux-fsdevel, linux-kernel

Artem B. Bityuckiy wrote:
> Jamie Lokier wrote:
> >I think I've wanted something like that for _years_ in unix.
> >
> >It's an old, old idea, and I've often wondered why we haven't implemented 
> >it.
> >
> 
> I thought it is possible to rather easily to implement this on top
> of non-transactional FS (albeit I didn't try) and there is no need
> to overcomplicate an FS. Just implement a specialized user-space
> library and utilize it.

No.  A transaction means that _all_ processes will see the whole
transaction or not.

It does _not_ mean that only a subset of programs, which happen to
link with a particular user-space library, will see it or not.

For example, you can use transactions for distro package management: a
whole update of a package would be a single transaction, so that at no
time does any program see an inconsistent set of files.  See why
_every_ process in the system must have the same view?

[ If you meant that you can implement it with a user-space library
that every process in the system links to, that's true.  But it would
rather miss the point of having filesystems in the kernel at all :) ]

-- Jamie

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: filesystem transactions API
  2005-04-26 14:32                       ` Jamie Lokier
@ 2005-04-26 14:46                         ` Artem B. Bityuckiy
  2005-04-26 15:19                           ` Jamie Lokier
  2005-04-26 15:01                         ` John Stoffel
  1 sibling, 1 reply; 95+ messages in thread
From: Artem B. Bityuckiy @ 2005-04-26 14:46 UTC (permalink / raw)
  To: Jamie Lokier; +Cc: Ville Herva, linux-fsdevel, linux-kernel

Jamie Lokier wrote:
> Artem B. Bityuckiy wrote:
> 
> No.  A transaction means that _all_ processes will see the whole
> transaction or not.
> 
> It does _not_ mean that only a subset of programs, which happen to
> link with a particular user-space library, will see it or not.
> 
> For example, you can use transactions for distro package management: a
> whole update of a package would be a single transaction, so that at no
> time does any program see an inconsistent set of files.  See why
> _every_ process in the system must have the same view?
> 
> [ If you meant that you can implement it with a user-space library
> that every process in the system links to, that's true.  But it would
> rather misses the point of having filesystems in the kernel at all :) ]
> 
Hmm, so the whole point to implement transactions in the kernel space is 
to do the transactions in a way that nobody can see any intermediate 
inconsistent state ?


-- 
Best regards, Artem B. Bityuckiy
Oktet Labs (St. Petersburg), Software Engineer.
+78124286709 (office) +79112449030 (mobile)
E-mail: dedekind@oktetlabs.ru, web: http://www.oktetlabs.ru

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: filesystem transactions API
  2005-04-26 14:32                       ` Jamie Lokier
  2005-04-26 14:46                         ` Artem B. Bityuckiy
@ 2005-04-26 15:01                         ` John Stoffel
  2005-04-26 15:12                           ` Lars Marowsky-Bree
                                             ` (2 more replies)
  1 sibling, 3 replies; 95+ messages in thread
From: John Stoffel @ 2005-04-26 15:01 UTC (permalink / raw)
  To: Jamie Lokier; +Cc: Artem B. Bityuckiy, Ville Herva, linux-fsdevel, linux-kernel

>>>>> "Jamie" == Jamie Lokier <jamie@shareable.org> writes:

Jamie> No.  A transaction means that _all_ processes will see the
Jamie> whole transaction or not.

This is really hard.  How do you handle the case where process X
starts a transaction and modifies files a, b & c, but process Y has file b
open for writing, and never lets it go?  Or the file gets unlinked?  

Jamie> For example, you can use transactions for distro package
Jamie> management: a whole update of a package would be a single
Jamie> transaction, so that at no time does any program see an
Jamie> inconsistent set of files.  See why _every_ process in the
Jamie> system must have the same view?

What about programs that are already open and running?  

It might be doable in some sense, but I can see that details are
really hard to get right.  Esp without breaking existing Unix
semantics.  

But then again, I could be smoking something good (or bad :-) here, so
take what I say with a grain of salt.

John

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: filesystem transactions API
  2005-04-26 15:01                         ` John Stoffel
@ 2005-04-26 15:12                           ` Lars Marowsky-Bree
  2005-04-26 15:19                           ` Trond Myklebust
  2005-04-26 15:24                           ` Jamie Lokier
  2 siblings, 0 replies; 95+ messages in thread
From: Lars Marowsky-Bree @ 2005-04-26 15:12 UTC (permalink / raw)
  To: John Stoffel, Jamie Lokier
  Cc: Artem B. Bityuckiy, Ville Herva, linux-fsdevel, linux-kernel

On 2005-04-26T11:01:54, John Stoffel <john@stoffel.org> wrote:

> Jamie> No.  A transaction means that _all_ processes will see the
> Jamie> whole transaction or not.
> This is really hard.  How do you handle the case where process X
> starts a transaction modifies files a, b & c, but process Y has file b
> open for writing, and never lets it go?  Or the file gets unlinked?  

I suggest you ask Hans, reiser4 does have such a feature if I recall
correctly.

It gets a whole lot more interesting if you want the sucker to span
more than one mount though.


Sincerely,
    Lars Marowsky-Brée <lmb@suse.de>

-- 
High Availability & Clustering
SUSE Labs, Research and Development
SUSE LINUX Products GmbH - A Novell Business

-
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: filesystem transactions API
  2005-04-26 14:46                         ` Artem B. Bityuckiy
@ 2005-04-26 15:19                           ` Jamie Lokier
  0 siblings, 0 replies; 95+ messages in thread
From: Jamie Lokier @ 2005-04-26 15:19 UTC (permalink / raw)
  To: Artem B. Bityuckiy; +Cc: Ville Herva, linux-fsdevel, linux-kernel

Artem B. Bityuckiy wrote:
> Hmm, so the whole point to implement transactions in the kernel space is 
> to do the transactions in a way that nobody can see any intermediate 
> inconsistent state ?

Yes.

-- Jamie

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: filesystem transactions API
  2005-04-26 15:01                         ` John Stoffel
  2005-04-26 15:12                           ` Lars Marowsky-Bree
@ 2005-04-26 15:19                           ` Trond Myklebust
  2005-04-26 15:29                             ` Ritesh Kumar
  2005-04-26 15:47                             ` Jamie Lokier
  2005-04-26 15:24                           ` Jamie Lokier
  2 siblings, 2 replies; 95+ messages in thread
From: Trond Myklebust @ 2005-04-26 15:19 UTC (permalink / raw)
  To: John Stoffel
  Cc: Jamie Lokier, Artem B. Bityuckiy, Ville Herva,
	Linux Filesystem Development, linux-kernel

ty den 26.04.2005 Klokka 11:01 (-0400) skreiv John Stoffel:
> >>>>> "Jamie" == Jamie Lokier <jamie@shareable.org> writes:
> 
> Jamie> No.  A transaction means that _all_ processes will see the
> Jamie> whole transaction or not.
> 
> This is really hard.  How do you handle the case where process X
> starts a transaction modifies files a, b & c, but process Y has file b
> open for writing, and never lets it go?  Or the file gets unlinked?  

That is why implementing it as a form of lock makes sense.

> Jamie> For example, you can use transactions for distro package
> Jamie> management: a whole update of a package would be a single
> Jamie> transaction, so that at no time does any program see an
> Jamie> inconsistent set of files.  See why _every_ process in the
> Jamie> system must have the same view?
> 
> What about programs that are already open and running?  
> 
> It might be doable in some sense, but I can see that details are
> really hard to get right.  Esp without breaking existing Unix
> semantics.  

Wrong.

Cheers,
  Trond
-- 
Trond Myklebust <trond.myklebust@fys.uio.no>


^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: filesystem transactions API
  2005-04-26 15:01                         ` John Stoffel
  2005-04-26 15:12                           ` Lars Marowsky-Bree
  2005-04-26 15:19                           ` Trond Myklebust
@ 2005-04-26 15:24                           ` Jamie Lokier
  2005-04-26 17:22                             ` Diego Calleja
  2005-04-27  9:34                             ` Jan Hudec
  2 siblings, 2 replies; 95+ messages in thread
From: Jamie Lokier @ 2005-04-26 15:24 UTC (permalink / raw)
  To: John Stoffel; +Cc: Artem B. Bityuckiy, Ville Herva, linux-fsdevel, linux-kernel

John Stoffel wrote:
> >>>>> "Jamie" == Jamie Lokier <jamie@shareable.org> writes:
> 
> Jamie> No.  A transaction means that _all_ processes will see the
> Jamie> whole transaction or not.
> 
> This is really hard.  How do you handle the case where process X
> starts a transaction modifies files a, b & c, but process Y has file b
> open for writing, and never lets it go?  Or the file gets unlinked?  

Then it starts to depend on what kind of transactions you want to
implement.

You can say that a transaction isn't allowed when a process has one of
the files opened for writing.  Or you can say a transaction is
equivalent to calling all of the I/O system calls at once.  You can
also decide if you want the reads and directory lookups performed in
the transactions to become prerequisites for the transaction
completing (so it's aborted if another process writes to those file
regions or changes the directory structure in a way which breaks a
prerequisite), or if you want those to lock the things which are read
for the duration of the transaction, or even just ignore reads for
transaction purposes.  Or, you can say that transactions are limited
to just directory structure, and not file contents (that's good enough
for package management), or you can say they're limited to just file
contents (that's good enough for databases and text file edits).

Etc, etc, quite a lot of semantic choices.

> What about programs that are already open and running?  
> 
> It might be doable in some sense, but I can see that details are
> really hard to get right.  Esp without breaking existing Unix
> semantics.  

It's even harder without kernel support! :)

-- Jamie

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: filesystem transactions API
  2005-04-26 15:19                           ` Trond Myklebust
@ 2005-04-26 15:29                             ` Ritesh Kumar
  2005-04-26 15:50                               ` Jamie Lokier
                                                 ` (2 more replies)
  2005-04-26 15:47                             ` Jamie Lokier
  1 sibling, 3 replies; 95+ messages in thread
From: Ritesh Kumar @ 2005-04-26 15:29 UTC (permalink / raw)
  To: Trond Myklebust
  Cc: John Stoffel, Jamie Lokier, Artem B. Bityuckiy, Ville Herva,
	Linux Filesystem Development

On 4/26/05, Trond Myklebust <trond.myklebust@fys.uio.no> wrote:
> ty den 26.04.2005 Klokka 11:01 (-0400) skreiv John Stoffel:
> > >>>>> "Jamie" == Jamie Lokier <jamie@shareable.org> writes:
> >
> > Jamie> No.  A transaction means that _all_ processes will see the
> > Jamie> whole transaction or not.
> >
> > This is really hard.  How do you handle the case where process X
> > starts a transaction modifies files a, b & c, but process Y has file b
> > open for writing, and never lets it go?  Or the file gets unlinked?
> 
> That is why implementing it as a form of lock makes sense.
> 
> > Jamie> For example, you can use transactions for distro package
> > Jamie> management: a whole update of a package would be a single
> > Jamie> transaction, so that at no time does any program see an
> > Jamie> inconsistent set of files.  See why _every_ process in the
> > Jamie> system must have the same view?
> >
> > What about programs that are already open and running?
> >
> > It might be doable in some sense, but I can see that details are
> > really hard to get right.  Esp without breaking existing Unix
> > semantics.
> 
> Wrong.
> 
> Cheers,
>   Trond
> --
> Trond Myklebust <trond.myklebust@fys.uio.no>
> 
> -
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

I was wondering if it was also important to commit the changes to
persistent storage once the transaction has been completed or is it
just important that all processes see the changes atomically. I mean,
is it also important that before any of the processes see the result
of the transaction, or before the 'transacting' process knows the
transaction is complete, the changes must be flushed to persistent
storage? That would be all the more reason to implement it in kernel
space...

Ritesh
P.S. Sorry... I missed the reply_all button for the first mail.
-- 
http://www.cs.unc.edu/~ritesh/

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: filesystem transactions API
  2005-04-26 14:22                     ` Artem B. Bityuckiy
  2005-04-26 14:32                       ` Jamie Lokier
@ 2005-04-26 15:40                       ` Charles P. Wright
  2005-04-26 16:07                         ` Artem B. Bityuckiy
  2005-04-27  9:37                         ` Lars Marowsky-Bree
  2005-04-27 13:36                       ` Andi Kleen
  2 siblings, 2 replies; 95+ messages in thread
From: Charles P. Wright @ 2005-04-26 15:40 UTC (permalink / raw)
  To: Artem B. Bityuckiy; +Cc: Jamie Lokier, Ville Herva, linux-fsdevel, linux-kernel

On Tue, 2005-04-26 at 18:22 +0400, Artem B. Bityuckiy wrote:
> Jamie Lokier wrote:
> > I think I've wanted something like that for _years_ in unix.
> > 
> > It's an old, old idea, and I've often wondered why we haven't implemented it.
> > 
> 
> I thought it is possible to rather easily to implement this on top
> of non-transactional FS (albeit I didn't try) and there is no need
> to overcomplicate an FS. Just implement a specialized user-space
> library and utilize it.
There are actually plenty of things that make it harder than it first
seems to provide ACID transactions.  The two most difficult things are
going to be atomicity and isolation.

Atomicity is difficult, because you have lots of caches each with their
own bits of state (e.g., the inode/dentry caches).  Assuming your
transaction is committed that isn't so much of a problem, but once you
have a rollback you need to undo any changes to those caches.

Isolation (this is the property that says that concurrent transactions
should be the same as if there was a serial execution) is also tricky to
get right.  A transaction can touch any number of objects, and user-
applications may not respect any lock ordering --- which means you will
have deadlocks, and you must detect and resolve them (probably by
aborting one of the transactions).

None of these problems are insurmountable, and there are definitely good
reasons to use transactions.  For example, RPM uses transactions to
update its own databases, it would be great if it could use transactions
to update the whole file system.  Mail servers also have to go through
hoops to provide atomic updates.  Isolation takes care of race
conditions.

At our lab, we've been experimenting with transactional file systems.
We've ported the Berkeley database to the kernel, because it already
provides ACID transactions.  We've also built a simple file system on
top of it, with a rudimentary transactions API that is exposed to user-
level.  One of the key things that we've learned is that it isn't very
easy to just "bolt" transactions onto your file system after the fact,
because there are just so many interactions between the file system,
caches, and the transaction manager.

Charles

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: filesystem transactions API
  2005-04-26 15:19                           ` Trond Myklebust
  2005-04-26 15:29                             ` Ritesh Kumar
@ 2005-04-26 15:47                             ` Jamie Lokier
  2005-04-26 15:51                               ` Artem B. Bityuckiy
  1 sibling, 1 reply; 95+ messages in thread
From: Jamie Lokier @ 2005-04-26 15:47 UTC (permalink / raw)
  To: Trond Myklebust
  Cc: John Stoffel, Artem B. Bityuckiy, Ville Herva,
	Linux Filesystem Development, linux-kernel

Trond Myklebust wrote:
> > Jamie> No.  A transaction means that _all_ processes will see the
> > Jamie> whole transaction or not.
> > 
> > This is really hard.  How do you handle the case where process X
> > starts a transaction modifies files a, b & c, but process Y has file b
> > open for writing, and never lets it go?  Or the file gets unlinked?  
> 
> That is why implementing it as a form of lock makes sense.

The problem with making them exclusive locks is that you halt the
system for the duration of the transaction.  If it's a big transaction
such as updating 1000 files for a package update, that blocks a lot of
programs for a long time, and it's not necessary.

And, because that's a potential denial of service, you have to limit
the size of transactions and their duration, especially for ordinary
users.  That makes transactions a lot less useful than they can be.

I would implement them as a combination of time-limited lock, and
abortable transaction with file & directory reads establishing
prerequisites.

While the transaction lock is held, everything read (i.e. read byte
ranges, lock byte ranges, directory lookups, and stat results) causes
the corresponding range or inode to be exclusively locked for this
transaction, and also causes it to be recorded in the prerequisite
set for this transaction.  Everything written (i.e. byte ranges or any
other filesystem modifying operation) is queued.

If the transaction lock timeout is reached before the transaction is
closed, all the exclusive locks for this transaction are released, and
the transaction lock itself is released, and the prerequisite set
continues to be recorded.

If at any time, another process tries to modify any of the information
in the transaction's prerequisite set, then firstly: if the
transaction lock is held, the other process is blocked until that lock
is released.  Secondly: if the other process successfully modifies
information in the transaction's prerequisite set, the transaction is
aborted.  All further operations in this transaction will fail,
including reads, writes, and the final close which commits writes.

Finally, when the transaction is closed, either it fails because
prerequisites were modified, or it commits all the pending filesystem
modifications of this transaction.

Why two phases?

The second phase, with no exclusive locking, is to allow ordinary
users to use transactions without blocking other processes or hogging
excessive system resources.  It allows other processes to progress
while a big transaction is in progress.  In other words, it prevents
some kinds of denial-of-service, allows arbitrarily large transactions
as long as there's enough space in the filesystem, and is generally
better.

The first phase, with exclusive locking, uses a randomised timeout for
the lock.  This is to prevent starvation of transacting processes by
other processes.  It's analogous to the problem of readers starving
writers in some kinds of read-write locks.  The randomised timeout is
to prevent mutual starvation between two or more transacting
processes, which might otherwise get into synchronised livelock.

Enjoy :)
-- Jamie

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: filesystem transactions API
  2005-04-26 15:29                             ` Ritesh Kumar
@ 2005-04-26 15:50                               ` Jamie Lokier
  2005-04-26 16:44                               ` Trond Myklebust
  2005-04-26 22:44                               ` Bryan Henderson
  2 siblings, 0 replies; 95+ messages in thread
From: Jamie Lokier @ 2005-04-26 15:50 UTC (permalink / raw)
  To: ritesh
  Cc: Trond Myklebust, John Stoffel, Artem B. Bityuckiy, Ville Herva,
	Linux Filesystem Development

Ritesh Kumar wrote:
> I was wondering if it was also important to commit the changes to
> persistent storage once the transaction has been completed or is it
> just important that all processes see the changes atomically. I mean,
> is it also important that before any of the processes see the result
> of the transaction, or before the 'transacting' process knows the
> transaction is complete, the changes must be flushed to persistent
> storage? That would be all the more reason to implement it in kernel
> space...

They are different kinds of transaction, that's all.  The kernel could
provide either kind, or both.

A logical way to offer both would be to include
fsync/fdatasync/fdatasync_range operations in a transaction.

-- Jamie

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: filesystem transactions API
  2005-04-26 15:47                             ` Jamie Lokier
@ 2005-04-26 15:51                               ` Artem B. Bityuckiy
  2005-04-26 15:56                                 ` Jamie Lokier
  0 siblings, 1 reply; 95+ messages in thread
From: Artem B. Bityuckiy @ 2005-04-26 15:51 UTC (permalink / raw)
  To: Jamie Lokier
  Cc: Trond Myklebust, John Stoffel, Ville Herva,
	Linux Filesystem Development, linux-kernel

Jamie Lokier wrote:
> The problem with making them exclusive locks is that you halt the
> system for the duration of the transaction.  If it's a big transaction
> such as updating 1000 files for a package update, that blocks a lot of
> programs for a long time, and it's not necessary.

Surely we'll anyway block others if we have a kernel-level transaction 
support?
What is the difference in which layer to block?

-- 
Best regards, Artem B. Bityuckiy
Oktet Labs (St. Petersburg), Software Engineer.
+78124286709 (office) +79112449030 (mobile)
E-mail: dedekind@oktetlabs.ru, web: http://www.oktetlabs.ru

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: filesystem transactions API
  2005-04-26 15:51                               ` Artem B. Bityuckiy
@ 2005-04-26 15:56                                 ` Jamie Lokier
  2005-04-26 16:01                                   ` Artem B. Bityuckiy
  0 siblings, 1 reply; 95+ messages in thread
From: Jamie Lokier @ 2005-04-26 15:56 UTC (permalink / raw)
  To: Artem B. Bityuckiy
  Cc: Trond Myklebust, John Stoffel, Ville Herva,
	Linux Filesystem Development, linux-kernel

Artem B. Bityuckiy wrote:
> Jamie Lokier wrote:
> >The problem with making them exclusive locks is that you halt the
> >system for the duration of the transaction.  If it's a big transaction
> >such as updating 1000 files for a package update, that blocks a lot of
> >programs for a long time, and it's not necessary.
> 
> Surely we'll anyway block others if we have a kernel-level
> transaction support?  What is the difference in which layer to
> block?

No.  Why would you block?  You can have transactions without blocking
other processes.

When updating, say, the core-utils package (which contains cat),
there's no reason why a program which executes "cat" should have to
block during the update.  It can simply execute the old one until the
new one is committed at the end of the update.

It's analogous to RCU for protecting kernel data structures without
blocking readers.

-- Jamie

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: filesystem transactions API
  2005-04-26 15:56                                 ` Jamie Lokier
@ 2005-04-26 16:01                                   ` Artem B. Bityuckiy
  2005-04-27  9:14                                     ` Jan Hudec
  0 siblings, 1 reply; 95+ messages in thread
From: Artem B. Bityuckiy @ 2005-04-26 16:01 UTC (permalink / raw)
  To: Jamie Lokier
  Cc: Trond Myklebust, John Stoffel, Ville Herva,
	Linux Filesystem Development, linux-kernel

Jamie Lokier wrote:
> No.  Why would you block?  You can have transactions without blocking
> other processes.
> 
> When updating, say, the core-utils package (which contains cat),
> there's no reason why a program which executes "cat" should have to
> block during the update.  It can simply execute the old one until the
> new one is committed at the end of the update.
> 
> It's analogous to RCU for protecting kernel data structures without
> blocking readers.
> 
Hmm, can't we implement a user-space locking system which admits of 
readers during transactions? I guess we can.

-- 
Best regards, Artem B. Bityuckiy
Oktet Labs (St. Petersburg), Software Engineer.
+78124286709 (office) +79112449030 (mobile)
E-mail: dedekind@oktetlabs.ru, web: http://www.oktetlabs.ru

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: filesystem transactions API
  2005-04-26 15:40                       ` Charles P. Wright
@ 2005-04-26 16:07                         ` Artem B. Bityuckiy
  2005-04-26 17:22                           ` Charles P. Wright
  2005-04-27  9:37                         ` Lars Marowsky-Bree
  1 sibling, 1 reply; 95+ messages in thread
From: Artem B. Bityuckiy @ 2005-04-26 16:07 UTC (permalink / raw)
  To: Charles P. Wright; +Cc: Jamie Lokier, Ville Herva, linux-fsdevel, linux-kernel

Charles P. Wright wrote:
> Atomicity is difficult, because you have lots of caches each with their
> own bits of state (e.g., the inode/dentry caches).  Assuming your
> transaction is committed that isn't so much of a problem, but once you
> have on rollback you need to undo any changes to those caches.
I guess if you synchronize before unlocking, all is OK. Roll-back 
means deleting partially written things, restoring the old things, and then 
running fsyncs. Why might this not be enough?

-- 
Best regards, Artem B. Bityuckiy
Oktet Labs (St. Petersburg), Software Engineer.
+78124286709 (office) +79112449030 (mobile)
E-mail: dedekind@oktetlabs.ru, web: http://www.oktetlabs.ru

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: filesystem transactions API
  2005-04-26 15:29                             ` Ritesh Kumar
  2005-04-26 15:50                               ` Jamie Lokier
@ 2005-04-26 16:44                               ` Trond Myklebust
  2005-04-26 22:44                               ` Bryan Henderson
  2 siblings, 0 replies; 95+ messages in thread
From: Trond Myklebust @ 2005-04-26 16:44 UTC (permalink / raw)
  To: ritesh
  Cc: John Stoffel, Jamie Lokier, Artem B. Bityuckiy, Ville Herva,
	Linux Filesystem Development

ty den 26.04.2005 Klokka 11:29 (-0400) skreiv Ritesh Kumar:

> I was wondering if it was also important to commit the changes to
> persistent storage once the transaction has been completed or is it
> just important that all processes see the changes atomically. I mean,
> is it also important that before any of the processes see the result
> of the transaction, or before the 'transacting' process knows the
> transaction is complete, the changes must be flushed to persistent
> storage? That would be all the more reason to implement it in kernel
> space...

If you want it to be useful for database-like applications, then the
answer is that yes it should be possible to guarantee this. The whole
point for most applications is that the on-disk file should always end
up in a fully consistent and/or recoverable state no matter what happens
to the machine itself (power failure, disk failure, application
failure,...).

That sort of thing is relatively easy to do for file data in user space
(databases do it all the time). In the kernel, a filesystem can extend
this to also manage its metadata, and it might perhaps be able to
piggyback some of the rollback features onto its journalling system. I
suspect that the common Linux journalling filesystems probably couldn't,
though, since they all use update-in-place schemes.

Cheers,
  Trond
-- 
Trond Myklebust <trond.myklebust@fys.uio.no>

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: filesystem transactions API
  2005-04-26 15:24                           ` Jamie Lokier
@ 2005-04-26 17:22                             ` Diego Calleja
  2005-04-26 17:38                               ` Jamie Lokier
  2005-04-27  9:34                             ` Jan Hudec
  1 sibling, 1 reply; 95+ messages in thread
From: Diego Calleja @ 2005-04-26 17:22 UTC (permalink / raw)
  To: Jamie Lokier; +Cc: john, dedekind, v, linux-fsdevel, linux-kernel

El Tue, 26 Apr 2005 16:24:34 +0100,
Jamie Lokier <jamie@shareable.org> escribió:

> It's even harder without kernel support! :)

This seems to implement something in userspace which might be interesting:
http://users.auriga.wearlab.de/~alb/libjio/
-
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: filesystem transactions API
  2005-04-26 16:07                         ` Artem B. Bityuckiy
@ 2005-04-26 17:22                           ` Charles P. Wright
  0 siblings, 0 replies; 95+ messages in thread
From: Charles P. Wright @ 2005-04-26 17:22 UTC (permalink / raw)
  To: Artem B. Bityuckiy; +Cc: Jamie Lokier, Ville Herva, linux-fsdevel, linux-kernel

On Tue, 2005-04-26 at 20:07 +0400, Artem B. Bityuckiy wrote:
> Charles P. Wright wrote:
> > Atomicity is difficult, because you have lots of caches each with their
> > own bits of state (e.g., the inode/dentry caches).  Assuming your
> > transaction is committed that isn't so much of a problem, but once you
> > have on rollback you need to undo any changes to those caches.
> I guess if you do synchronization before unlocking all is OK. Roll-back 
> means deleting partially written things and restore old things, then run 
> fsyncs. Whys this may be not enough?
That would be fine for the on-disk image of the file system, but the in-
memory image also needs to be handled.  Keeping track of all of these
objects and their changes is not a simple task.

Charles


^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: filesystem transactions API
  2005-04-26 17:22                             ` Diego Calleja
@ 2005-04-26 17:38                               ` Jamie Lokier
  0 siblings, 0 replies; 95+ messages in thread
From: Jamie Lokier @ 2005-04-26 17:38 UTC (permalink / raw)
  To: Diego Calleja; +Cc: john, dedekind, v, linux-fsdevel, linux-kernel

Diego Calleja wrote:
> > It's even harder without kernel support! :)
> 
> This seems to implement something in userspace which might be interesting:
> http://users.auriga.wearlab.de/~alb/libjio/

Thanks.  That looks like a handy little library.

It doesn't do full filesystem transactions, obviously.  Just
transactions within a single file, and requiring all processes using
the file to cooperate.

-- Jamie


^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: filesystem transactions API
  2005-04-26 15:29                             ` Ritesh Kumar
  2005-04-26 15:50                               ` Jamie Lokier
  2005-04-26 16:44                               ` Trond Myklebust
@ 2005-04-26 22:44                               ` Bryan Henderson
  2 siblings, 0 replies; 95+ messages in thread
From: Bryan Henderson @ 2005-04-26 22:44 UTC (permalink / raw)
  To: ritesh, Artem B. Bityuckiy, Jamie Lokier, John Stoffel,
	Linux Filesystem Development, linux-fsdevel-owner,
	Trond Myklebust, Ville Herva

Also, as long as we're dreaming, to do the whole job in today's world, you 
would want a transaction to include work done across multiple kernels. 
E.g. if you're moving a file from one network filesystem to another, you'd 
want the move to be atomic.

I used a transactional filesystem like that briefly in the early '90s.  It 
was the brand new SFS for IBM's VM.  It was a marvelous piece of work that 
actually started my interest in filesystems.  It's too bad it was on VM, 
since VM was not the platform people were choosing for new things.

I heard the same thing was also in IBM's OS/2, though.  At least there it 
had a chance of flourishing, but that ship sank for other reasons.

--
Bryan Henderson                          IBM Almaden Research Center
San Jose CA                              Filesystems

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: filesystem transactions API
  2005-04-26 16:01                                   ` Artem B. Bityuckiy
@ 2005-04-27  9:14                                     ` Jan Hudec
  0 siblings, 0 replies; 95+ messages in thread
From: Jan Hudec @ 2005-04-27  9:14 UTC (permalink / raw)
  To: Artem B. Bityuckiy
  Cc: Jamie Lokier, Trond Myklebust, John Stoffel, Ville Herva,
	Linux Filesystem Development, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 992 bytes --]

On Tue, Apr 26, 2005 at 20:01:45 +0400, Artem B. Bityuckiy wrote:
> Jamie Lokier wrote:
> >No.  Why would you block?  You can have transactions without blocking
> >other processes.
> >
> >When updating, say, the core-utils package (which contains cat),
> >there's no reason why a program which executes "cat" should have to
> >block during the update.  It can simply execute the old one until the
> >new one is committed at the end of the update.
> >
> >It's analogous to RCU for protecting kernel data structures without
> >blocking readers.
> >
> Hmm, can't we implement a user-space locking system which admits of 
> readers during transactions? I gues we can.

The problem with implementing it in userland, as was already said in the
thread, is that if some process does not use the library, it can
completely mess things up. It is only safe in the kernel.

-------------------------------------------------------------------------------
						 Jan 'Bulb' Hudec <bulb@ucw.cz>

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-24 21:43               ` Jamie Lokier
  2005-04-25  7:14                 ` Jan Hudec
@ 2005-04-27  9:14                 ` Helge Hafting
  1 sibling, 0 replies; 95+ messages in thread
From: Helge Hafting @ 2005-04-27  9:14 UTC (permalink / raw)
  To: Jamie Lokier
  Cc: Al Viro, Miklos Szeredi, hch, linux-fsdevel, linux-kernel, akpm

Jamie Lokier wrote:

> He wants to do this:
>
>   1. From client, login to server and do a usermount on $HOME/private.
>
>   2. From client, login to server and read the files previously mounted.
>  
>
This works fine with plain "mount", except that the mount isn't
hidden from others.  Why hide it?  Permissions can be used to prevent
others from looking at the mounted stuff if need be.  I.e. put
the mountpoint in a directory not readable by others, or
have the root of that fs unreadable by others.

Helge Hafting

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: filesystem transactions API
  2005-04-26 15:24                           ` Jamie Lokier
  2005-04-26 17:22                             ` Diego Calleja
@ 2005-04-27  9:34                             ` Jan Hudec
  2005-04-27 13:43                               ` Ville Herva
  1 sibling, 1 reply; 95+ messages in thread
From: Jan Hudec @ 2005-04-27  9:34 UTC (permalink / raw)
  To: Jamie Lokier
  Cc: John Stoffel, Artem B. Bityuckiy, Ville Herva, linux-fsdevel,
	linux-kernel

[-- Attachment #1: Type: text/plain, Size: 3033 bytes --]

On Tue, Apr 26, 2005 at 16:24:34 +0100, Jamie Lokier wrote:
> John Stoffel wrote:
> > >>>>> "Jamie" == Jamie Lokier <jamie@shareable.org> writes:
> > 
> > Jamie> No.  A transaction means that _all_ processes will see the
> > Jamie> whole transaction or not.
> > 
> > This is really hard.  How do you handle the case where process X
> > starts a transaction modifies files a, b & c, but process Y has file b
> > open for writing, and never lets it go?  Or the file gets unlinked?  
> 
> Then it starts to depend on what kind of transactions you want to
> implement.
> 
> You can say that a transaction isn't allowed when a process has one of
> the files opened for writing.  Or you can say a transaction is
> equivalent to calling all of the I/O system calls at once.  You can
> also decide if you want the reads and directory lookups performed in
> the transactions to become prerequisites for the transaction
> completing (so it's aborted if another process writes to those file
> regions or changes the directory structure in a way which breaks a
> prerequisite), or if you want those to lock the things which are read
> for the duration of the transaction, or even just ignore reads for
> transaction purposes.  Or, you can say that transactions are limited
> to just directory structure, and not file contents (that's good enough
> for package management), or you can say they're limited to just file
> contents (that's good enough for databases and text file edits).
> 
> Etc, etc, quite a lot of semantic choices.

How do we specify which calls belong to a transaction? By some kind of
extra file handle?

I'd think having global per-process transaction is not the best way.
So I think we should have some kind of transaction handle (probably in
the file handle space) and a way to say that a syscall is done within
a transaction. To avoid duplicating all syscalls, we could have
set_active_transaction() operation.

Now I think the criteria for semantics should be serializability. That
would mean, that lookup paths would have to be locked IFF the lookup was
done within the transaction -- but you would be free to open a file
without transaction, then set_active_transaction and write that file.
That way the write would become atomic, but someone else could freely
rename the file from under you.

Note: Editors currently write to a temporary file and rename over the
original (if they have permissions to do it), which is as good a
transaction as they need.

> > What about programs that are already open and running?  
> > 
> > It might be doable in some sense, but I can see that details are
> > really hard to get right.  Esp without breaking existing Unix
> > semantics.  
> 
> It's even harder without kernel support! :)

If every syscall (touching filesystem) was turned into a transaction of
its own, it wouldn't break any semantics.

-------------------------------------------------------------------------------
						 Jan 'Bulb' Hudec <bulb@ucw.cz>

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: filesystem transactions API
  2005-04-26 15:40                       ` Charles P. Wright
  2005-04-26 16:07                         ` Artem B. Bityuckiy
@ 2005-04-27  9:37                         ` Lars Marowsky-Bree
  1 sibling, 0 replies; 95+ messages in thread
From: Lars Marowsky-Bree @ 2005-04-27  9:37 UTC (permalink / raw)
  To: Charles P. Wright, Artem B. Bityuckiy
  Cc: Jamie Lokier, Ville Herva, linux-fsdevel, linux-kernel

On 2005-04-26T11:40:02, "Charles P. Wright" <cwright@cs.sunysb.edu> wrote:

> Atomicity is difficult, because you have lots of caches each with their
> own bits of state (e.g., the inode/dentry caches).  Assuming your
> transaction is committed that isn't so much of a problem, but once you
> have a rollback you need to undo any changes to those caches.
> 
> Isolation (this is the property that says that concurrent transactions
> should be the same as if there was a serial execution) is also tricky to
> get right.  A transaction can touch any number of objects, and user-
> applications may not respect any lock ordering --- which means you will
> have deadlocks, and you must detect and resolve them (probably by
> aborting one of the transactions).

Just as a weird idea, spawned by the FUSE thread.

"Transactions happen in their own namespace".

Besides having a namespace_(create|join) as needed for FUSE (or
similar), there'd be a privileged namespace_replace(target, source) (or
_merge, if you prefer - that however seems to imply that a namespace was
actually forked off another).

So, you want transactions for testing some software update, you create
your new one, mount stuff, do the update, and then "commit" it by
replacing the global namespace by it.

If you want to discard, just exit it. As soon as no further references
to a namespace exist, it can be cleaned up (and non-persistent
transactions will be 'unrolled' and thrown away).

Now where's that pipe of mine... ;-)

Sincerely,
    Lars Marowsky-Brée <lmb@suse.de>

-- 
High Availability & Clustering
SUSE Labs, Research and Development
SUSE LINUX Products GmbH - A Novell Business

-
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: filesystem transactions API
  2005-04-26 14:22                     ` Artem B. Bityuckiy
  2005-04-26 14:32                       ` Jamie Lokier
  2005-04-26 15:40                       ` Charles P. Wright
@ 2005-04-27 13:36                       ` Andi Kleen
  2 siblings, 0 replies; 95+ messages in thread
From: Andi Kleen @ 2005-04-27 13:36 UTC (permalink / raw)
  To: Artem B. Bityuckiy; +Cc: Ville Herva, linux-fsdevel, linux-kernel, jamie

"Artem B. Bityuckiy" <dedekind@oktetlabs.ru> writes:

> Jamie Lokier wrote:
>> I think I've wanted something like that for _years_ in unix.
>> It's an old, old idea, and I've often wondered why we haven't
>> implemented it.
>>
>
> I thought it is possible to rather easily implement this on top
> of non-transactional FS (albeit I didn't try) and there is no need
> to overcomplicate an FS. Just implement a specialized user-space
> library and utilize it.

Yes it is. e.g. newer sleepycat DB has a nice library for this.
It should be somewhere on your distribution.

-Andi

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: filesystem transactions API
  2005-04-27  9:34                             ` Jan Hudec
@ 2005-04-27 13:43                               ` Ville Herva
  2005-04-27 15:17                                 ` Jamie Lokier
  0 siblings, 1 reply; 95+ messages in thread
From: Ville Herva @ 2005-04-27 13:43 UTC (permalink / raw)
  To: Jan Hudec
  Cc: Jamie Lokier, John Stoffel, Artem B. Bityuckiy, linux-fsdevel,
	linux-kernel

On Wed, Apr 27, 2005 at 11:34:12AM +0200, you [Jan Hudec] wrote:
> On Tue, Apr 26, 2005 at 16:24:34 +0100, Jamie Lokier wrote:
> > John Stoffel wrote:
> > > >>>>> "Jamie" == Jamie Lokier <jamie@shareable.org> writes:
> > > 
> > > Jamie> No.  A transaction means that _all_ processes will see the
> > > Jamie> whole transaction or not.
> > > 
> > > This is really hard.  How do you handle the case where process X
> > > starts a transaction modifies files a, b & c, but process Y has file b
> > > open for writing, and never lets it go?  Or the file gets unlinked?  
> > 
> > Then it starts to depend on what kind of transactions you want to
> > implement.
> > 
> > You can say that a transaction isn't allowed when a process has one of
> > the files opened for writing.  Or you can say a transaction is
> > equivalent to calling all of the I/O system calls at once.  You can
> > also decide if you want the reads and directory lookups performed in
> > the transactions to become prerequisites for the transaction
> > completing (so it's aborted if another process writes to those file
> > regions or changes the directory structure in a way which breaks a
> > prerequisite), or if you want those to lock the things which are read
> > for the duration of the transaction, or even just ignore reads for
> > transaction purposes.  Or, you can say that transactions are limited
> > to just directory structure, and not file contents (that's good enough
> > for package management), or you can say they're limited to just file
> > contents (that's good enough for databases and text file edits).
> > 
> > Etc, etc, quite a lot of semantic choices.
> 
> How do we specify which calls belong to a transaction? By some kind of
> extra file handle?
> 
> I'd think having global per-process transaction is not the best way.
> So I think we should have some kind of transaction handle (probably in
> the file handle space) and a way to say that a syscall is done within
> a transaction. To avoid duplicating all syscalls, we could have
> set_active_transaction() operation.

That's more or less what NTFS does. See the example at
http://blogs.msdn.com/because_we_can/
 


-- v -- 

v@iki.fi


^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: filesystem transactions API
  2005-04-27 13:43                               ` Ville Herva
@ 2005-04-27 15:17                                 ` Jamie Lokier
  0 siblings, 0 replies; 95+ messages in thread
From: Jamie Lokier @ 2005-04-27 15:17 UTC (permalink / raw)
  To: Ville Herva
  Cc: Jan Hudec, John Stoffel, Artem B. Bityuckiy, linux-fsdevel,
	linux-kernel

Ville Herva wrote:
> > How do we specify which calls belong to a transaction? By some kind of
> > extra file handle?
> > 
> > I'd think having global per-process transaction is not the best way.
> > So I think we should have some kind of transaction handle (probably in
> > the file handle space) and a way to say that a syscall is done within
> > a transaction. To avoid duplicating all syscalls, we could have
> > set_active_transaction() operation.
> 
> That's more or less what NTFS does. See the example at
> http://blogs.msdn.com/because_we_can/

That's the obvious choice but it limits the usefulness quite a lot.

If we have transactions, then I'd like to be able to do this from a shell:

    transaction_open t

    tar xvpSfz blahblah.tar.gz
    cd blahblah
    patch -p1 -E < foo.patch
    # etc.

    transaction_close $t

I'd also like to write inside a single C program:

    transaction * t = transaction_open ();

    /* Ordinary complicated filesystem operations here... */
    link (a, b);
    rename (c, d);
    read, write, stat etc.
    conf = open ("/etc/blahblah.conf", O_RDONLY);
    read (conf, ...)
    close (conf);
    /* If /etc/blahblah.conf is changed by another program during
       the transaction, the transaction is invalidated, because the
       dbm update below is dependent on what was read... */
    dbm_open (...);
    do_dbm_stuff (...);
    dbm_close (...);
    /* Whatever this command does, I'd like to include in the transaction. */
    system ("perl -pi -e 's/old_value/new_value/g' /etc/another.conf");

    transaction_close (t);

Fundamentally, if transactions are supported in the kernel then these
two usages are easy to offer:

    1. Ordinary file system calls as part of a transaction.

       This allows libraries which are not transaction-aware to be
       used, such as the dbm example above, and other things like XML
       parsers/writers.

    2. Subprocesses inherit a transaction, so a program can execute
       complex transactions by using other programs.

It's useful, and there is no good reason to disallow that.

Nonetheless, there's a need for some kind of transaction handles.  A
file descriptor representing a transaction seems like a natural fit.

Complex programs will want to have multiple transactions at the same
time: For example, any program structured using event-driven logic or
async I/O may have multiple independent state machines per thread,
each wanting to be able to have their own transactions.

This suggests a few things:

  - Transactions have a file descriptor to represent them.

  - Each thread has a "current transaction" that applies to all filesystem
    operations.

  - Concurrent threads will need their own current transactions, even
    while keeping "current directory" global to the whole process for
    POSIX reasons.  A process wide "current transaction" is too coarse.

  - Transactions should be automatically nestable: a program or
    library which uses transactions should itself be callable from a
    program or library which is using a transaction.

  - Transactions should record whether they cannot provide
    transactions for some operation that is attempted (e.g. writing to
    a file on a remote filesystem), aborting the transaction.

  - When a transaction aborts due to the actions of _another_ process
    (or thread) which is outside the transaction, that abort is an
    event which should be detectable synchronously (by polling the
    transaction fd) or asynchronously (by a signal - the SIGIO
    mechanism is fine for this).

  - An exclusive locking period should be optional, requested by a
    flag when opening the transaction.  Most usages will want the
    locking period with its default parameters.

  - Ideally, programs or mechanisms which provide alternative views of
    part of a filesystem, such as search results (Beagle), tarfs, or
    mailfs, should be able to update synchronously with transactions
    that affect whatever the view is watching, so that the view
    changes are effectively part of the transaction.  This does _not_
    mean that a transaction must wait for watchers to calculate
    anything.  It does mean a transaction must synchronously and
    simultaneously invalidate caches held by watchers during the
    atomic commit.

-- Jamie

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-26 14:07                   ` Jamie Lokier
@ 2005-04-28 13:28                     ` Eric Van Hensbergen
  2005-04-28 19:22                       ` Jamie Lokier
  2005-04-28 13:47                     ` Eric Van Hensbergen
  1 sibling, 1 reply; 95+ messages in thread
From: Eric Van Hensbergen @ 2005-04-28 13:28 UTC (permalink / raw)
  To: Jamie Lokier
  Cc: Pavel Machek, Al Viro, Miklos Szeredi, hch, linux-fsdevel,
	linux-kernel, akpm

> 
> Looking closer, I think we already have it.
> 
> It's called /proc/NNN/root.
> 
> Does chroot into /proc/NNN/root cause the chroot'ing process to adopt
> the namespace of NNN?  Looking at the code, I think it does.
> 
    ...
> 
> So no new system calls are needed.  A daemon to hand out per-user
> namespaces (or any other policy) can be written using existing
> kernels, and those namespaces can be joined using chroot.
> 
> That's the theory anyway.  It's always possible I misread the code (as
> I don't use namespaces and don't have tools handy to try them).
> 

I've been thinking about this a bit more...would you even need chroot?
(wouldn't exposing chroot functionality to a user incur additional
security risk?  I guess it would be okay as long as you were only
chrooting to one of your other process' roots?)

If you were organized about where the mounts in your private namespace
were done, you could just mount -bind them from
/proc/NNN/root/home/$USER/mnt (or something).  That requires a certain
amount of discipline in your mounts (or maybe not -- just diff
/proc/NNN/mounts to see what you are missing and bind the
differences).

         -eric

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-26 14:07                   ` Jamie Lokier
  2005-04-28 13:28                     ` Eric Van Hensbergen
@ 2005-04-28 13:47                     ` Eric Van Hensbergen
  2005-04-28 19:20                       ` Jamie Lokier
  1 sibling, 1 reply; 95+ messages in thread
From: Eric Van Hensbergen @ 2005-04-28 13:47 UTC (permalink / raw)
  To: Jamie Lokier
  Cc: Pavel Machek, Al Viro, Miklos Szeredi, hch, linux-fsdevel,
	linux-kernel, akpm

On 4/26/05, Jamie Lokier <jamie@shareable.org> wrote:
> 
> It's called /proc/NNN/root.
> 
> So no new system calls are needed.  A daemon to hand out per-user
> namespaces (or any other policy) can be written using existing
> kernels, and those namespaces can be joined using chroot.
> 
> That's the theory anyway.  It's always possible I misread the code (as
> I don't use namespaces and don't have tools handy to try them).
> 

Should have checked myself before posting my previous reply -- but
this doesn't seem to work.  /proc/NNN/root is represented as a
symlink, but when you CLONE_NS and then try to look at another one of
your process' /proc/NNN/root the link doesn't seem to have a target
and you get permission denied on all accesses.  I haven't looked at
the underlying procfs code, but adapting procfs for this sort of
purpose feels wrong.

          -eric

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-28 13:47                     ` Eric Van Hensbergen
@ 2005-04-28 19:20                       ` Jamie Lokier
  2005-04-28 19:39                         ` Ram
  0 siblings, 1 reply; 95+ messages in thread
From: Jamie Lokier @ 2005-04-28 19:20 UTC (permalink / raw)
  To: Eric Van Hensbergen
  Cc: Pavel Machek, Al Viro, Miklos Szeredi, hch, linux-fsdevel,
	linux-kernel, akpm

Eric Van Hensbergen wrote:
> > It's called /proc/NNN/root.
> > 
> > So no new system calls are needed.  A daemon to hand out per-user
> > namespaces (or any other policy) can be written using existing
> > kernels, and those namespaces can be joined using chroot.
> > 
> > That's the theory anyway.  It's always possible I misread the code (as
> > I don't use namespaces and don't have tools handy to try them).
> > 
> 
> Should have checked myself before posting my previous reply -- but
> this doesn't seem to work.  /proc/NNN/root is represented as a
> symlink, but when you CLONE_NS and then try to look at another one of
> your process' /proc/NNN/root the link doesn't seem to have a target
> and you get permission denied on all accesses.

I've looked at the code.  Look in fs/proc/base.c (Linux 2.6.10),
proc_root_link().

I don't see anything there to prevent you from traversing to the
mounts in the other namespace.

So why is it failing?  Any idea?

> I haven't looked at the underlying procfs code, but adapting procfs
> for this sort of purpose feels wrong.

Having a file/directory which represents namespaces held by another
process makes much more sense to me than new system calls and
inventing yet another id space to represent namespaces.

And, given that you can look at the filesystems another process can
see by doing ptrace on it, it might as well be accessible in a more
natural way.

-- Jamie

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-28 13:28                     ` Eric Van Hensbergen
@ 2005-04-28 19:22                       ` Jamie Lokier
  0 siblings, 0 replies; 95+ messages in thread
From: Jamie Lokier @ 2005-04-28 19:22 UTC (permalink / raw)
  To: Eric Van Hensbergen
  Cc: Pavel Machek, Al Viro, Miklos Szeredi, hch, linux-fsdevel,
	linux-kernel, akpm

Eric Van Hensbergen wrote:
> > Does chroot into /proc/NNN/root cause the chroot'ing process to adopt
> > the namespace of NNN?  Looking at the code, I think it does.
> 
> I've been thinking about this a bit more...would you even need chroot?
> (wouldn't exposing chroot functionality to a user incur additional
> security risk?  I guess it would be okay as long as you were only
> chrooting to one of your other process' roots?)

You don't need to let an ordinary user do chroot.

The login process can do it before it changes uid to the user, the
same as it does to set up all the other per-user parameters.

-- Jamie

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-28 19:20                       ` Jamie Lokier
@ 2005-04-28 19:39                         ` Ram
  2005-04-28 22:08                           ` Jamie Lokier
  0 siblings, 1 reply; 95+ messages in thread
From: Ram @ 2005-04-28 19:39 UTC (permalink / raw)
  To: Jamie Lokier
  Cc: Eric Van Hensbergen, Pavel Machek, Al Viro, Miklos Szeredi, hch,
	linux-fsdevel, linux-kernel, Andrew Morton

On Thu, 2005-04-28 at 12:20, Jamie Lokier wrote:
> Eric Van Hensbergen wrote:
> > > It's called /proc/NNN/root.
> > > 
> > > So no new system calls are needed.  A daemon to hand out per-user
> > > namespaces (or any other policy) can be written using existing
> > > kernels, and those namespaces can be joined using chroot.
> > > 
> > > That's the theory anyway.  It's always possible I misread the code (as
> > > I don't use namespaces and don't have tools handy to try them).
> > > 
> > 
> > Should have checked myself before posting my previous reply -- but
> > this doesn't seem to work.  /proc/NNN/root is represented as a
> > symlink, but when you CLONE_NS and then try to look at another one of
> > your process' /proc/NNN/root the link doesn't seem to have a target
> > and you get permission denied on all accesses.
> 
> I've looked at the code.  Look in fs/proc/base.c (Linux 2.6.10),
> proc_root_link().
> 
> I don't see anything there to prevent you from traversing to the
> mounts in the other namespace.
> 
> So why is it failing?  Any idea?

Since you are traversing a symlink, you will be traversing the symlink
in the context of the traversing process's namespace.

If process 'x' is traversing /proc/y/root , the lookup for the root
dentry will happen in the context of process x's  namespace, and not
process y's namespace. Hence process 'x' won't really get into
the namespace of the process y.

RP


^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-28 19:39                         ` Ram
@ 2005-04-28 22:08                           ` Jamie Lokier
  2005-04-29  7:57                             ` Ram
  0 siblings, 1 reply; 95+ messages in thread
From: Jamie Lokier @ 2005-04-28 22:08 UTC (permalink / raw)
  To: Ram
  Cc: Eric Van Hensbergen, Pavel Machek, Al Viro, Miklos Szeredi, hch,
	linux-fsdevel, linux-kernel, Andrew Morton

Ram wrote:
> > I've looked at the code.  Look in fs/proc/base.c (Linux 2.6.10),
> > proc_root_link().
> > 
> > I don't see anything there to prevent you from traversing to the
> > mounts in the other namespace.
> > 
> > So why is it failing?  Any idea?
> 
> Since you are traversing a symlink, you will be traversing the symlink
> in the context of traversing process's namespace. 
> 
> If process 'x' is traversing /proc/y/root , the lookup for the root
> dentry will happen in the context of process x's  namespace, and not
> process y's namespace. Hence process 'x' wont really get into
> the namespace of the process y.

Lookups don't happen in the context of a namespace.

They happen in the context of a vfsmnt.  And the switch to a new
vfsmnt is done by matching against (dentry,parent-vfsmnt) pairs.
current->namespace is only checked for mount & unmount operations, not
for path lookups.

Which means proc_root_link, when it switches to the vfsmnt at the root
of the other process, should traverse into the tree of vfsmnts which
make up the other namespace.

-- Jamie

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-28 22:08                           ` Jamie Lokier
@ 2005-04-29  7:57                             ` Ram
  2005-04-29 14:13                               ` Miklos Szeredi
  0 siblings, 1 reply; 95+ messages in thread
From: Ram @ 2005-04-29  7:57 UTC (permalink / raw)
  To: Jamie Lokier
  Cc: Eric Van Hensbergen, Pavel Machek, Al Viro, Miklos Szeredi, hch,
	linux-fsdevel, linux-kernel, Andrew Morton

On Thu, 2005-04-28 at 15:08, Jamie Lokier wrote:
> Ram wrote:
> > > I've looked at the code.  Look in fs/proc/base.c (Linux 2.6.10),
> > > proc_root_link().
> > > 
> > > I don't see anything there to prevent you from traversing to the
> > > mounts in the other namespace.
> > > 
> > > So why is it failing?  Any idea?
> > 
> > Since you are traversing a symlink, you will be traversing the symlink
> > in the context of traversing process's namespace. 
> > 
> > If process 'x' is traversing /proc/y/root , the lookup for the root
> > dentry will happen in the context of process x's  namespace, and not
> > process y's namespace. Hence process 'x' wont really get into
> > the namespace of the process y.
> 
> Lookups don't happen in the context of a namespace.
> 
> They happen in the context of a vfsmnt.  And the switch to a new
> vfsmnt is done by matching against (dentry,parent-vfsmnt) pairs.
> current->namespace is only checked for mount & unmount operations, not
> for path lookups.

Looked deeper into the code, and realized that in procfs, the symlink is
not followed through link_path_walk(). instead it is expected to
return the root vfsmount of the traversed process as you rightly
pointed out.

 
> 
> Which means proc_root_link, when it switches to the vfsmnt at the root
> of the other process, should traverse into the tree of vfsmnts which
> make up the other namespace.

Yes. But proc_check_root() in proc_pid_follow_link() is failing the 
traversal, because it is expecting the root vfsmount of the traversed
process to belong to the vfsmount tree of the traversing process.
In other words it's expecting them to be both in the same namespace.

The permissions get denied by this code in proc_check_root():

         while (vfsmnt != our_vfsmnt) {
                if (vfsmnt == vfsmnt->mnt_parent)
                        goto out;
                de = vfsmnt->mnt_mountpoint;
                vfsmnt = vfsmnt->mnt_parent;
        }

RP
> -- Jamie


^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-29  7:57                             ` Ram
@ 2005-04-29 14:13                               ` Miklos Szeredi
  2005-04-29 14:42                                 ` Jamie Lokier
  0 siblings, 1 reply; 95+ messages in thread
From: Miklos Szeredi @ 2005-04-29 14:13 UTC (permalink / raw)
  To: linuxram
  Cc: jamie, ericvh, pavel, viro, miklos, hch, linux-fsdevel,
	linux-kernel, akpm

> > 
> > Which means proc_root_link, when it switches to the vfsmnt at the root
> > of the other process, should traverse into the tree of vfsmnts which
> > make up the other namespace.
> 
> Yes. But proc_check_root() in proc_pid_follow_link() is failing the 
> traversal, because it is expecting the root vfsmount of the traversed
> process to belong to the vfsmount tree of the traversing process.
> In other words its expecting them to be both in the same namespace.
> 
> The permissions get denied by this code in proc_check_root():
> 

Removing the check makes chroot enter the tree under the other
process's namespace.  However it does not actually change the
namespace, hence mount/umount won't work.

So joining a namespace does need a new syscall unfortunately.

Miklos

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-29 14:13                               ` Miklos Szeredi
@ 2005-04-29 14:42                                 ` Jamie Lokier
  2005-04-29 14:50                                   ` Question about current->namespace and check_mnt() Jamie Lokier
  0 siblings, 1 reply; 95+ messages in thread
From: Jamie Lokier @ 2005-04-29 14:42 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: linuxram, ericvh, pavel, viro, hch, linux-fsdevel, linux-kernel,
	akpm

Miklos Szeredi wrote:
> Removing the check makes chroot enter the tree under the other
> process's namespace.  However it does not actually change the
> namespace, hence mount/umount won't work.
> 
> So joining a namespace does need a new syscall unfortunately.

It would be trivial to copy mnt->mnt_namespace to current->namespace
in set_fs_root.  No need for a syscall just for that.

Given that it works, the right place to decide whether it's allowed is
the permissions on /proc/NNN/root.  But remember that you can already
access another process' namespace using ptrace on that process, so
this doesn't relax security if /proc/NNN/root can be entered whenever
ptrace is allowed.

I would really like to know what the purpose of check_mnt() is in
namespace.c.  In standard kernels you can't enter another process'
namespace (without the change you tried in proc/base.c), so I don't see
how check_mnt() can _ever_ fail.  Can it?

And if it can't fail, is there any need for current->namespace, or can
it just be removed?

-- Jamie

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Question about current->namespace and check_mnt()
  2005-04-29 14:42                                 ` Jamie Lokier
@ 2005-04-29 14:50                                   ` Jamie Lokier
  0 siblings, 0 replies; 95+ messages in thread
From: Jamie Lokier @ 2005-04-29 14:50 UTC (permalink / raw)
  To: viro
  Cc: Miklos Szeredi, linuxram, ericvh, pavel, hch, linux-fsdevel,
	linux-kernel, akpm

Hi Al,

I have a specific namespace.c question:

I would really like to know what the purpose of check_mnt() is in
namespace.c.  In standard kernels you can't enter another process'
namespace so I don't see how check_mnt() can _ever_ fail.  Can it
fail, or in other words, what is the purpose of that check?

And if it can't fail, is there really a need for current->namespace, or
can it just be removed?

Also, I would think the current process' rootmnt->mnt_namespace would
adequately define the "current process namespace", so making
current->namespace redundant in that way.  Is that right?

Thanks,
-- Jamie

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-25 19:07               ` Jamie Lokier
  2005-04-26  9:29                 ` Pavel Machek
@ 2005-04-30  8:33                 ` Christoph Hellwig
  2005-04-30 16:47                   ` Ram
  1 sibling, 1 reply; 95+ messages in thread
From: Christoph Hellwig @ 2005-04-30  8:33 UTC (permalink / raw)
  To: Jamie Lokier
  Cc: Pavel Machek, Al Viro, Miklos Szeredi, hch, linux-fsdevel,
	linux-kernel, akpm

On Mon, Apr 25, 2005 at 08:07:34PM +0100, Jamie Lokier wrote:
> Pavel Machek wrote:
> > > > ... is the same as for the same question with "set of mounts" replaced
> > > > with "environment variables".
> > > 
> > > Not quite.
> > > 
> > > After changing environment variables in .profile, you can copy them to
> > > other shells using ". ~/.profile".
> > > 
> > > There is no analogous mechanism to copy namespaces.
> > 
> > Actually, after you add right mount xyzzy /foo lines into .profile,
> > you can just . ~/.profile ;-).
> 
> Is there a mount command that can do that?  We're talking about
> private mounts - invisible to other namespaces, which includes the
> other shells.
> 
> If there was a /proc/NNN/namespace, that would do the trick :)

I don't think you need a /proc/NNN/namespace, /proc/NNN/mounts already
contains a mount table.  It's pretty trivial to write a small shellscript
to parse that, compare with the current namespace and do all mount/umounts
to make them fit the other processes namespace.  Real problem here are
filesystems that don't implement ->show_options or do so only partially
so that some options are lost.


^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-25  9:58                   ` Miklos Szeredi
  2005-04-25 11:45                     ` Jan Hudec
@ 2005-04-30  8:35                     ` Christoph Hellwig
  2005-04-30  9:25                       ` Miklos Szeredi
  1 sibling, 1 reply; 95+ messages in thread
From: Christoph Hellwig @ 2005-04-30  8:35 UTC (permalink / raw)
  To: Miklos Szeredi; +Cc: bulb, viro, hch, linux-fsdevel, linux-kernel, akpm

On Mon, Apr 25, 2005 at 11:58:50AM +0200, Miklos Szeredi wrote:
> > I can't write a script that reads your mind. But I sure can write
> > a script that finds out what you mounted in the other shells (with help
> > of a little wrapper around the mount command).
> 
> How do you bind mount it from a different namespace?  You _do_ need
> bind mount, since a new mount might require password input, etc...

Not necessarily.  The filesystem gets called into ->get_sb for every mount,
and can then decide whether to return an existing superblock instance or
setup a new one.  If the credentials for the new mount match an old one
it can just reuse it.  (e.g. for block based filesystem it will always reuse
right now)


^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-25  9:48               ` Olivier Galibert
  2005-04-25 16:37                 ` Tim Hockin
@ 2005-04-30  8:37                 ` Christoph Hellwig
  1 sibling, 0 replies; 95+ messages in thread
From: Christoph Hellwig @ 2005-04-30  8:37 UTC (permalink / raw)
  To: Olivier Galibert, linux-fsdevel, linux-kernel

On Mon, Apr 25, 2005 at 11:48:04AM +0200, Olivier Galibert wrote:
> On Sun, Apr 24, 2005 at 10:19:42PM +0100, Al Viro wrote:
> > Of course you can.  It does execute the obvious set of rc files.
> 
> Is there a possibility for a process to change its namespace to
> another existing one?  That would be needed to have a per-user
> namespace you go to from rc files or pam.

It is not right now, and I don't think joining a namespace is a concept
that fits very well into our architecture.  What does make sense is an
unshare() syscall that takes the CLONE_* argument and unshares those in
the current process from the parent without creating a new process.  Then
you can easily reproduce another namespace by value instead of by reference.

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-30  8:35                     ` Christoph Hellwig
@ 2005-04-30  9:25                       ` Miklos Szeredi
  2005-04-30  9:42                         ` Jamie Lokier
  2005-05-11  9:00                         ` Christoph Hellwig
  0 siblings, 2 replies; 95+ messages in thread
From: Miklos Szeredi @ 2005-04-30  9:25 UTC (permalink / raw)
  To: hch; +Cc: bulb, viro, hch, linux-fsdevel, linux-kernel, akpm

> > > I can't write a script that reads your mind. But I sure can write
> > > a script that finds out what you mounted in the other shells (with help
> > > of a little wrapper around the mount command).
> > 
> > How do you bind mount it from a different namespace?  You _do_ need
> > bind mount, since a new mount might require password input, etc...
> 
> Not nessecarily.  The filesystem gets called into ->get_sb for every mount,
> and can then decided whether to return an existing superblock instance or
> setup a new one.  If the credentials for the new mount match an old one
> it can just reuse it.  (e.g. for block based filesystem it will always reuse
> right now)

And if the credentials are checked in userspace (sshfs)?

Miklos

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-30  9:25                       ` Miklos Szeredi
@ 2005-04-30  9:42                         ` Jamie Lokier
  2005-04-30 10:14                           ` Miklos Szeredi
  2005-05-11  9:00                         ` Christoph Hellwig
  1 sibling, 1 reply; 95+ messages in thread
From: Jamie Lokier @ 2005-04-30  9:42 UTC (permalink / raw)
  To: Miklos Szeredi; +Cc: hch, bulb, viro, linux-fsdevel, linux-kernel, akpm

Miklos Szeredi wrote:
> > > How do you bind mount it from a different namespace?  You _do_ need
> > > bind mount, since a new mount might require password input, etc...
> > 
> > Not nessecarily.  The filesystem gets called into ->get_sb for every mount,
> > and can then decided whether to return an existing superblock instance or
> > setup a new one.  If the credentials for the new mount match an old one
> > it can just reuse it.  (e.g. for block based filesystem it will always reuse
> > right now)
> 
> And if the credentials are checked in userspace (sshfs)?

Well, if you can find a way to tell the userspace FUSE daemon to know
that the mount is being done by the same user as the existing mount,
you don't need (or want) to check the credentials - you want the FUSE
daemon to tell the kernel code which superblock to reuse.

This hack is a bit nasty - namespace per login, copying mounts
from another login's namespace - but it would work.

-- Jamie

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-30  9:42                         ` Jamie Lokier
@ 2005-04-30 10:14                           ` Miklos Szeredi
  2005-04-30 14:36                             ` Jamie Lokier
  0 siblings, 1 reply; 95+ messages in thread
From: Miklos Szeredi @ 2005-04-30 10:14 UTC (permalink / raw)
  To: jamie; +Cc: hch, bulb, viro, linux-fsdevel, linux-kernel, akpm

> Well, if you can find a way to tell the userspace FUSE daemon to know
> that the mount is being done by the same user as the existing mount,
> you don't need (or want) to check the credentials - you want the FUSE
> daemon to tell the kernel code which superblock to reuse.

It sounds very _very_ complicated compared to just using bind mounts.

And maybe the user _does_ want a new connection to the same server
(for whatever reason).  Why should we _force_ a sharing of
superblocks?

Miklos

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-30 10:14                           ` Miklos Szeredi
@ 2005-04-30 14:36                             ` Jamie Lokier
  2005-04-30 15:59                               ` Miklos Szeredi
  0 siblings, 1 reply; 95+ messages in thread
From: Jamie Lokier @ 2005-04-30 14:36 UTC (permalink / raw)
  To: Miklos Szeredi; +Cc: hch, bulb, viro, linux-fsdevel, linux-kernel, akpm

Miklos Szeredi wrote:
> > Well, if you can find a way to tell the userspace FUSE daemon to know
> > that the mount is being done by the same user as the existing mount,
> > you don't need (or want) to check the credentials - you want the FUSE
> > daemon to tell the kernel code which superblock to reuse.
> 
> It sounds very _very_ complicated compared to just using bind mounts.
> 
> And maybe the user _does_ want a new connection to the same server
> (for whatever reason).  Why should we _force_ a sharing of
> superblocks?

The point is that you can decide whether to do that in userspace.
It's up to whatever code you put in the _userspace_ FUSE commands.

No kernel support for bind mounts from another namespace is required.

Actually, in terms of complexity, it's not much different from using
bind mounts.  Either way involves finding all the mounts of another
session and copying them one by one: either by getting confirmation
from the daemon to attach to the same superblock, or by getting
handles from the daemon for all the individual directories to bind
mount.

In all, I think private namespaces are still the cleaner way to do it
_when_ a user wants their mounts to appear in multiple sessions anyway.

But bind mounts or superblock sharing are more flexible, at the same
time as being more cumbersome as a user interface.

-- Jamie

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-30 14:36                             ` Jamie Lokier
@ 2005-04-30 15:59                               ` Miklos Szeredi
  2005-04-30 16:42                                 ` Jamie Lokier
  0 siblings, 1 reply; 95+ messages in thread
From: Miklos Szeredi @ 2005-04-30 15:59 UTC (permalink / raw)
  To: jamie; +Cc: hch, bulb, viro, linux-fsdevel, linux-kernel, akpm

> Actually, in terms of complexity, it's not much different from using
> bind mounts.

As has been suggested by Pavel, bind mounting foreign namespaces could
just be done with a new bind_fd(fd, path) syscall and file descriptor
passing with SCM_RIGHTS.

That sounds to me orders of magnitude less complex (on the kernel side
at least) than sb sharing.

Miklos

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-30 15:59                               ` Miklos Szeredi
@ 2005-04-30 16:42                                 ` Jamie Lokier
  2005-04-30 17:07                                   ` Miklos Szeredi
  0 siblings, 1 reply; 95+ messages in thread
From: Jamie Lokier @ 2005-04-30 16:42 UTC (permalink / raw)
  To: Miklos Szeredi; +Cc: hch, bulb, viro, linux-fsdevel, linux-kernel, akpm

Miklos Szeredi wrote:
> > Actually, in terms of complexity, it's not much different from using
> > bind mounts.
> 
> As has been suggested by Pavel, bind mounting foreign namespaces could
> just be done with a new bind_fd(fd, path) syscall and file descriptor
> passing with SCM_RIGHTS.

Yes, he's right.

But you don't need a new system call to bind an fd.

"mount --bind /proc/self/fd/N mount_point" works, try it.

> That sounds to me orders of magnitude less complex (on the kernel side
> at least) than sb sharing.

In terms of what happens in the kernel, they're almost exactly the
same: either way, a super block ends up shared by two mounts.  That's
what I meant.

I agree that in terms of what userspace has to do, if just binding
works that's simpler.  And it does seem to work with the above mount
command.

-- Jamie

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-30  8:33                 ` [PATCH] private mounts Christoph Hellwig
@ 2005-04-30 16:47                   ` Ram
  0 siblings, 0 replies; 95+ messages in thread
From: Ram @ 2005-04-30 16:47 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jamie Lokier, Pavel Machek, Al Viro, Miklos Szeredi,
	linux-fsdevel, linux-kernel, Andrew Morton

On Sat, 2005-04-30 at 01:33, Christoph Hellwig wrote:
> On Mon, Apr 25, 2005 at 08:07:34PM +0100, Jamie Lokier wrote:
> > Pavel Machek wrote:
> > > > > ... is the same as for the same question with "set of mounts" replaced
> > > > > with "environment variables".
> > > > 
> > > > Not quite.
> > > > 
> > > > After changing environment variables in .profile, you can copy them to
> > > > other shells using ". ~/.profile".
> > > > 
> > > > There is no analogous mechanism to copy namespaces.
> > > 
> > > Actually, after you add right mount xyzzy /foo lines into .profile,
> > > you can just . ~/.profile ;-).
> > 
> > Is there a mount command that can do that?  We're talking about
> > private mounts - invisible to other namespaces, which includes the
> > other shells.
> > 
> > If there was a /proc/NNN/namespace, that would do the trick :)
> 
> I don't think you need a /proc/NNN/namespace, /proc/NNN/mounts already
> contains a mount table.  It's pretty trivial to write a small shellscript
> to parse that, compare with the current namespace and do all mount/umounts
> to make them fit the other processes namespace.  Real problem here are
> filesystems that don't implement ->show_options or do so only partially
> so that some options are lost.

The other problem is: How would new mounts in any of these namespaces
propagate to other namespaces owned by the same user?

I mean, how will the other namespaces belonging to the same user be
able to pull the mounts into their namespaces?  Shared subtree won't be
a solution because these namespaces won't have a parent-child
relationship to begin with, for the propagation to be set up.


RP



> 
> -
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-30 16:42                                 ` Jamie Lokier
@ 2005-04-30 17:07                                   ` Miklos Szeredi
  2005-04-30 18:20                                     ` Olivier Galibert
  2005-04-30 23:54                                     ` Jamie Lokier
  0 siblings, 2 replies; 95+ messages in thread
From: Miklos Szeredi @ 2005-04-30 17:07 UTC (permalink / raw)
  To: jamie; +Cc: hch, bulb, viro, linux-fsdevel, linux-kernel, akpm

> But you don't need a new system call to bind an fd.
> 
> "mount --bind /proc/self/fd/N mount_point" works, try it.

Ahh, yes :)

Still proc_check_root() has to be relaxed, to allow dereferencing link
under a different namespace.  Maybe the check should be skipped for
capable(CAP_SYS_ADMIN) or similar.

What do people think about that?

Miklos

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-30 17:07                                   ` Miklos Szeredi
@ 2005-04-30 18:20                                     ` Olivier Galibert
  2005-04-30 23:58                                       ` Jamie Lokier
  2005-04-30 23:54                                     ` Jamie Lokier
  1 sibling, 1 reply; 95+ messages in thread
From: Olivier Galibert @ 2005-04-30 18:20 UTC (permalink / raw)
  To: Miklos Szeredi; +Cc: jamie, hch, bulb, viro, linux-fsdevel, linux-kernel, akpm

On Sat, Apr 30, 2005 at 07:07:56PM +0200, Miklos Szeredi wrote:
> > But you don't need a new system call to bind an fd.
> > 
> > "mount --bind /proc/self/fd/N mount_point" works, try it.
> 
> Ahh, yes :)
> 
> Still proc_check_root() has to be relaxed, to allow dereferencing link
> under a different namespace.  Maybe the check should be skipped for
> capable(CAP_SYS_ADMIN) or similar.
> 
> What do people think about that?

To me it looks like an atrocious hack that works only because of the
way the implementation is done and not really by design.  A well
defined interface where what you want to do is explicitly stated is way less
annoying long term.  I don't know what the right approach would be
(join <ns> vs. exec in <ns> vs. clone in <ns>) or even what a
namespace reference should look like (fd, pid, something else), and
probably only Al has a good idea of that.  Al, you've been quite
silent here.  What do you think the right method/interface would be to
start an interactive shell in a pre-existing different namespace?

  OG.

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-30 17:07                                   ` Miklos Szeredi
  2005-04-30 18:20                                     ` Olivier Galibert
@ 2005-04-30 23:54                                     ` Jamie Lokier
  2005-05-01  5:56                                       ` Miklos Szeredi
  1 sibling, 1 reply; 95+ messages in thread
From: Jamie Lokier @ 2005-04-30 23:54 UTC (permalink / raw)
  To: Miklos Szeredi; +Cc: hch, bulb, viro, linux-fsdevel, linux-kernel, akpm

Miklos Szeredi wrote:
> > But you don't need a new system call to bind an fd.
> > 
> > "mount --bind /proc/self/fd/N mount_point" works, try it.
> 
> Ahh, yes :)
> 
> Still proc_check_root() has to be relaxed, to allow dereferencing link
> under a different namespace.

Not necessary.

Why not have the FUSE daemon keep open a file descriptor for the
directory it's mounted on, and have it send that to new would-be
mounters of the same directory using a unix domain socket (rather as
Pavel suggested)?

> Maybe the check should be skipped for
> capable(CAP_SYS_ADMIN) or similar.

No.  The check is to prevent processes in chroot jails from accessing
directories outside their jail.  Even CAP_SYS_ADMIN processes must be
forbidden from doing that.

But proc_check_root is unnecessarily strict, in that it prevents a
process from traversing into a "child" namespace.

IMHO, a better security restriction anyway would be for processes in
chroot jails to not be able to see processes outside the jail in /proc
- only processes inside the jail should be visible.  I think everyone
agrees that would be best.

If that were implemented, then proc_check_root would be redundant and
could be removed entirely.

-- Jamie

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-30 18:20                                     ` Olivier Galibert
@ 2005-04-30 23:58                                       ` Jamie Lokier
  2005-05-01  2:39                                         ` Ram
  0 siblings, 1 reply; 95+ messages in thread
From: Jamie Lokier @ 2005-04-30 23:58 UTC (permalink / raw)
  To: Olivier Galibert, Miklos Szeredi, hch, bulb, viro, linux-fsdevel,
	linux-kernel, akpm

Olivier Galibert wrote:
> > > "mount --bind /proc/self/fd/N mount_point" works, try it.
> > 
> > What do people think about that?
> 
> To me it looks like an atrocious hack that works only because of the
> way the implementation is done and not really by design.

From fs/namespace.c:do_loopback, the function which does bind mounts:

	if (check_mnt(nd->mnt) && (!recurse || check_mnt(old_nd.mnt))) {

check_mnt() verifies that a mountpoint is in the same namespace as the
current process.  recurse is set for --rbind mounts, but not --bind mounts.

Notice how old_nd.mnt is explicitly _not_ checked for being in the current
namespace when doing --bind?

That says to me that Al thought about this case, and coded for it...

(I'm still not clear why the check_mnt() calls are needed at all, though).

-- Jamie

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-30 23:58                                       ` Jamie Lokier
@ 2005-05-01  2:39                                         ` Ram
  0 siblings, 0 replies; 95+ messages in thread
From: Ram @ 2005-05-01  2:39 UTC (permalink / raw)
  To: Jamie Lokier
  Cc: Olivier Galibert, Miklos Szeredi, hch, bulb, viro, linux-fsdevel,
	linux-kernel, Andrew Morton

On Sat, 2005-04-30 at 16:58, Jamie Lokier wrote:
> Olivier Galibert wrote:
> > > > "mount --bind /proc/self/fd/N mount_point" works, try it.
> > > 
> > > What do people think about that?
> > 
> > To me it looks like an atrocious hack that works only because of the
> > way the implementation is done and not really by design.
> 
> >From fs/namespace.c:do_loopback, the function which does bind mounts:
> 
> 	if (check_mnt(nd->mnt) && (!recurse || check_mnt(old_nd.mnt))) {
> 
> check_mnt() verifies that a mountpoint is in the same namespace as the
> current process.  recurse is set for --rbind mounts, but not --bind mounts.
> 
> Notice how old_nd.mnt is explicitly _not_ checked for being in the current
> namespace when doing --bind?

> That says to me that Al thought about this case, and coded for it...
> 
> (I'm still not clear why the check_mnt() calls are needed at all, though).
> 
Making a wild guess.

What if some filesystem allowed access to vfsmount in other namespace?
Just like the proc filesystem having the ability to do so, but
marginally stops it through the check in proc_check_root().

However the check you mentioned above where-a-bind-mount-across-
namespace is allowed, implies that there is some legal way of getting
access to vfsmounts in other namespace.  Or maybe a remote possibility
that it's a bug?

RP


> -- Jamie
> -
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-30 23:54                                     ` Jamie Lokier
@ 2005-05-01  5:56                                       ` Miklos Szeredi
  2005-05-01  6:39                                         ` Miklos Szeredi
  2005-05-01 15:41                                         ` Eric Van Hensbergen
  0 siblings, 2 replies; 95+ messages in thread
From: Miklos Szeredi @ 2005-05-01  5:56 UTC (permalink / raw)
  To: jamie; +Cc: hch, bulb, viro, linux-fsdevel, linux-kernel, akpm

> Not necessary.
> 
> Why not have the FUSE daemon keep open a file descriptor for the
> directory it's mounted on, and have it sent that to new would-be
> mounters of the same directory using a unix domain socket (rather as
> Pavel suggested)?

How does that help?  It doesn't matter _which_ process you try to bind
mount /proc/XXX/fd/N from, the result will be the same.

> No.  The check is to prevent processes in chroot jails from accessing
> directories outside their jail.  Even CAP_SYS_ADMIN processes must be
> forbidden from doing that.

As someone pointed out, CAP_SYS_ADMIN processes can already escape the
chroot jail with CLONE_NEWNS.  (fd=open("."); clone(CLONE_NEWNS);
[child:] fchdir(fd); chdir(".."))

> But proc_check_root is unnecessarily strict, in that it prevents a
> process from traversing into a "child" namespace.
> 
> IMHO, a better security restriction anyway would be for processes in
> chroot jails to not be able to see processes outside the jail in /proc
> - only processes inside the jail should be visible.  I think everyone
> agrees that would be best.

Dunno.  It's a big change possibly breaking existing applications.
Chroot probably has other uses than jailing.

> If that were implemented, then proc_check_root would be redundant and
> could be removed entirely.

Yes. 

Miklos

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-05-01  5:56                                       ` Miklos Szeredi
@ 2005-05-01  6:39                                         ` Miklos Szeredi
  2005-05-01 15:41                                         ` Eric Van Hensbergen
  1 sibling, 0 replies; 95+ messages in thread
From: Miklos Szeredi @ 2005-05-01  6:39 UTC (permalink / raw)
  To: jamie; +Cc: hch, bulb, viro, linux-fsdevel, linux-kernel, akpm

> But proc_check_root is unnecessarily strict, in that it prevents a
> process from traversing into a "child" namespace.
> 
> IMHO, a better security restriction anyway would be for processes in
> chroot jails to not be able to see processes outside the jail in /proc
> - only processes inside the jail should be visible.  I think everyone
> agrees that would be best.

Creating a new namespace would also have the same effect (only
processes using that namespace are visible).  It would be rather ugly,
if a user could not see processes in other login sessions, just
because he uses private namespaces.

Miklos

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-05-01  5:56                                       ` Miklos Szeredi
  2005-05-01  6:39                                         ` Miklos Szeredi
@ 2005-05-01 15:41                                         ` Eric Van Hensbergen
  1 sibling, 0 replies; 95+ messages in thread
From: Eric Van Hensbergen @ 2005-05-01 15:41 UTC (permalink / raw)
  To: Miklos Szeredi; +Cc: jamie, hch, bulb, viro, linux-fsdevel, linux-kernel, akpm

On 5/1/05, Miklos Szeredi <miklos@szeredi.hu> wrote:
> 
> As someone pointed out, CAP_SYS_ADMIN processes can already escape the
> chroot jail with CLONE_NEWNS.  (fd=open("."); clone(CLONE_NEWNS);
> [child:] fchdir(fd); chdir(".."))
> 

This really does seem like a bug.  Is there a reason behind this
"feature", or should one of us be looking into a patch to correct
this?

Miklos you earlier suggested:
>>>How about fixing fchdir, so it checks whether you gone outside the
>>>tree under current->fs->rootmnt?  Should be fairly easy to do.

         -eric

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-04-30  9:25                       ` Miklos Szeredi
  2005-04-30  9:42                         ` Jamie Lokier
@ 2005-05-11  9:00                         ` Christoph Hellwig
  2005-05-11 10:42                           ` Miklos Szeredi
  1 sibling, 1 reply; 95+ messages in thread
From: Christoph Hellwig @ 2005-05-11  9:00 UTC (permalink / raw)
  To: Miklos Szeredi; +Cc: hch, bulb, viro, linux-fsdevel, linux-kernel, akpm

On Sat, Apr 30, 2005 at 11:25:10AM +0200, Miklos Szeredi wrote:
> > > > I can't write a script that reads your mind. But I sure can write
> > > > a script that finds out what you mounted in the other shells (with help
> > > > of a little wrapper around the mount command).
> > > 
> > > How do you bind mount it from a different namespace?  You _do_ need
> > > bind mount, since a new mount might require password input, etc...
> > 
> > Not nessecarily.  The filesystem gets called into ->get_sb for every mount,
> > and can then decided whether to return an existing superblock instance or
> > setup a new one.  If the credentials for the new mount match an old one
> > it can just reuse it.  (e.g. for block based filesystem it will always reuse
> > right now)
> 
> And if the credentials are checked in userspace (sshfs)?

Then it needs to call to userspace in ->get_sb..

^ permalink raw reply	[flat|nested] 95+ messages in thread

* Re: [PATCH] private mounts
  2005-05-11  9:00                         ` Christoph Hellwig
@ 2005-05-11 10:42                           ` Miklos Szeredi
  0 siblings, 0 replies; 95+ messages in thread
From: Miklos Szeredi @ 2005-05-11 10:42 UTC (permalink / raw)
  To: hch; +Cc: bulb, viro, linux-fsdevel, linux-kernel, akpm

> > > > > I can't write a script that reads your mind. But I sure can
> > > > > write a script that finds out what you mounted in the other
> > > > > shells (with help of a little wrapper around the mount
> > > > > command).
> > > > 
> > > > How do you bind mount it from a different namespace?  You _do_
> > > > need bind mount, since a new mount might require password
> > > > input, etc...
> > > 
> > > Not nessecarily.  The filesystem gets called into ->get_sb for
> > > every mount, and can then decided whether to return an existing
> > > superblock instance or setup a new one.  If the credentials for
> > > the new mount match an old one it can just reuse it.  (e.g. for
> > > block based filesystem it will always reuse right now)
> > 
> > And if the credentials are checked in userspace (sshfs)?
> 
> The it needs to call to userspace in ->get_sb..

That's clear.

What I don't get is what's the point in adding complexity to the
kernel and userspace programs, when it can be done without _any_
changes, just by doing a bind mount.

It's not just calling ->get_sb.  It's finding the right filesystem
daemon, that has been started with the exact same command line
arguments, environment etc.

It's just not practical.

Miklos

^ permalink raw reply	[flat|nested] 95+ messages in thread

end of thread, other threads:[~2005-05-11 10:43 UTC | newest]

Thread overview: 95+ messages (download: mbox.gz, follow: Atom feed)
-- links below jump to the message on this page --
2005-04-24 20:08 [PATCH] private mounts Miklos Szeredi
2005-04-24 20:13 ` Al Viro
2005-04-24 20:45   ` Miklos Szeredi
2005-04-24 20:18 ` Christoph Hellwig
2005-04-24 20:50   ` Miklos Szeredi
2005-04-24 20:54     ` Al Viro
2005-04-24 20:59       ` Miklos Szeredi
2005-04-24 21:06         ` Christoph Hellwig
2005-04-24 21:12           ` Jamie Lokier
2005-04-24 21:06         ` Al Viro
2005-04-24 21:15           ` Miklos Szeredi
2005-04-24 21:19             ` Al Viro
2005-04-24 21:29               ` Miklos Szeredi
2005-04-24 21:39                 ` Jamie Lokier
2005-04-25  7:10                 ` Jan Hudec
2005-04-25  9:58                   ` Miklos Szeredi
2005-04-25 11:45                     ` Jan Hudec
2005-04-30  8:35                     ` Christoph Hellwig
2005-04-30  9:25                       ` Miklos Szeredi
2005-04-30  9:42                         ` Jamie Lokier
2005-04-30 10:14                           ` Miklos Szeredi
2005-04-30 14:36                             ` Jamie Lokier
2005-04-30 15:59                               ` Miklos Szeredi
2005-04-30 16:42                                 ` Jamie Lokier
2005-04-30 17:07                                   ` Miklos Szeredi
2005-04-30 18:20                                     ` Olivier Galibert
2005-04-30 23:58                                       ` Jamie Lokier
2005-05-01  2:39                                         ` Ram
2005-04-30 23:54                                     ` Jamie Lokier
2005-05-01  5:56                                       ` Miklos Szeredi
2005-05-01  6:39                                         ` Miklos Szeredi
2005-05-01 15:41                                         ` Eric Van Hensbergen
2005-05-11  9:00                         ` Christoph Hellwig
2005-05-11 10:42                           ` Miklos Szeredi
2005-04-24 21:43               ` Jamie Lokier
2005-04-25  7:14                 ` Jan Hudec
2005-04-27  9:14                 ` Helge Hafting
2005-04-25  9:48               ` Olivier Galibert
2005-04-25 16:37                 ` Tim Hockin
2005-04-30  8:37                 ` Christoph Hellwig
2005-04-25 21:09               ` Bryan Henderson
2005-04-26 13:46                 ` filesystem transactions API Ville Herva
2005-04-26 14:14                   ` Jamie Lokier
2005-04-26 14:22                     ` Artem B. Bityuckiy
2005-04-26 14:32                       ` Jamie Lokier
2005-04-26 14:46                         ` Artem B. Bityuckiy
2005-04-26 15:19                           ` Jamie Lokier
2005-04-26 15:01                         ` John Stoffel
2005-04-26 15:12                           ` Lars Marowsky-Bree
2005-04-26 15:19                           ` Trond Myklebust
2005-04-26 15:29                             ` Ritesh Kumar
2005-04-26 15:50                               ` Jamie Lokier
2005-04-26 16:44                               ` Trond Myklebust
2005-04-26 22:44                               ` Bryan Henderson
2005-04-26 15:47                             ` Jamie Lokier
2005-04-26 15:51                               ` Artem B. Bityuckiy
2005-04-26 15:56                                 ` Jamie Lokier
2005-04-26 16:01                                   ` Artem B. Bityuckiy
2005-04-27  9:14                                     ` Jan Hudec
2005-04-26 15:24                           ` Jamie Lokier
2005-04-26 17:22                             ` Diego Calleja
2005-04-26 17:38                               ` Jamie Lokier
2005-04-27  9:34                             ` Jan Hudec
2005-04-27 13:43                               ` Ville Herva
2005-04-27 15:17                                 ` Jamie Lokier
2005-04-26 15:40                       ` Charles P. Wright
2005-04-26 16:07                         ` Artem B. Bityuckiy
2005-04-26 17:22                           ` Charles P. Wright
2005-04-27  9:37                         ` Lars Marowsky-Bree
2005-04-27 13:36                       ` Andi Kleen
2005-04-26 14:25                   ` Trond Myklebust
2005-04-24 21:38           ` [PATCH] private mounts Jamie Lokier
2005-04-24 22:20             ` Ram
2005-04-24 22:22               ` Jamie Lokier
2005-04-25  6:00             ` Miklos Szeredi
2005-04-25  6:41               ` Ram
2005-04-25  9:55                 ` Miklos Szeredi
2005-04-25  7:22               ` Jan Hudec
2005-04-25 10:08                 ` Miklos Szeredi
2005-04-25 15:20             ` Pavel Machek
2005-04-25 19:07               ` Jamie Lokier
2005-04-26  9:29                 ` Pavel Machek
2005-04-26 14:07                   ` Jamie Lokier
2005-04-28 13:28                     ` Eric Van Hensbergen
2005-04-28 19:22                       ` Jamie Lokier
2005-04-28 13:47                     ` Eric Van Hensbergen
2005-04-28 19:20                       ` Jamie Lokier
2005-04-28 19:39                         ` Ram
2005-04-28 22:08                           ` Jamie Lokier
2005-04-29  7:57                             ` Ram
2005-04-29 14:13                               ` Miklos Szeredi
2005-04-29 14:42                                 ` Jamie Lokier
2005-04-29 14:50                                   ` Question about current->namespace and check_mnt() Jamie Lokier
2005-04-30  8:33                 ` [PATCH] private mounts Christoph Hellwig
2005-04-30 16:47                   ` Ram

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).