* [RFC] Support for stackable file systems on top of nfs
@ 2005-11-10 17:32 Dave Kleikamp
2005-11-10 20:07 ` Christoph Hellwig
2005-11-10 21:24 ` Trond Myklebust
0 siblings, 2 replies; 36+ messages in thread
From: Dave Kleikamp @ 2005-11-10 17:32 UTC (permalink / raw)
To: nfsv4, fsdevel
The following patch allows stackable file systems, such as ClearCase's
mvfs, to run atop nfs. mvfs has its own file and inode structures, but
points its inode->i_mapping to the lower file system's mapping. This
causes problems when nfs's address space operations try to extract the
open context from file->private_data.
The patch adds a small overhead of checking the file structure to see if
it contains an inode that is not the mapping's host.
I am curious if there are any other stackable file systems that could
benefit from this.
Signed-off-by: Dave Kleikamp <shaggy@austin.ibm.com>
diff -Nurp linux-2.6.14-git/fs/nfs/direct.c linux/fs/nfs/direct.c
--- linux-2.6.14-git/fs/nfs/direct.c 2005-11-07 07:53:49.000000000 -0600
+++ linux/fs/nfs/direct.c 2005-11-09 14:58:59.000000000 -0600
@@ -604,7 +604,19 @@ nfs_direct_IO(int rw, struct kiocb *iocb
if (!is_sync_kiocb(iocb))
return result;
- ctx = (struct nfs_open_context *)file->private_data;
+ if (nfs_is_valid_file(file))
+ ctx = get_nfs_open_context((struct nfs_open_context *)
+ file->private_data);
+ else {
+ /* file belongs to a stackable file system.
+ * Can't trust the inode either */
+ inode = inode->i_mapping->host;
+
+ ctx = nfs_find_open_context(inode, NULL,
+ (rw == READ) ? FMODE_READ : FMODE_WRITE);
+ if (ctx == NULL)
+ return -EBADF;
+ }
switch (rw) {
case READ:
dprintk("NFS: direct_IO(read) (%s) off/no(%Lu/%lu)\n",
@@ -623,6 +635,7 @@ nfs_direct_IO(int rw, struct kiocb *iocb
default:
break;
}
+ put_nfs_open_context(ctx);
return result;
}
diff -Nurp linux-2.6.14-git/fs/nfs/read.c linux/fs/nfs/read.c
--- linux-2.6.14-git/fs/nfs/read.c 2005-11-07 07:53:49.000000000 -0600
+++ linux/fs/nfs/read.c 2005-11-09 11:47:05.000000000 -0600
@@ -506,7 +506,7 @@ int nfs_readpage(struct file *file, stru
if (error)
goto out_error;
- if (file == NULL) {
+ if (!nfs_is_valid_file(file)) {
ctx = nfs_find_open_context(inode, NULL, FMODE_READ);
if (ctx == NULL)
return -EBADF;
@@ -575,7 +575,7 @@ int nfs_readpages(struct file *filp, str
(long long)NFS_FILEID(inode),
nr_pages);
- if (filp == NULL) {
+ if (!nfs_is_valid_file(filp)) {
desc.ctx = nfs_find_open_context(inode, NULL, FMODE_READ);
if (desc.ctx == NULL)
return -EBADF;
diff -Nurp linux-2.6.14-git/fs/nfs/write.c linux/fs/nfs/write.c
--- linux-2.6.14-git/fs/nfs/write.c 2005-11-07 07:53:49.000000000 -0600
+++ linux/fs/nfs/write.c 2005-11-09 14:14:33.000000000 -0600
@@ -703,10 +703,16 @@ static struct nfs_page * nfs_update_requ
int nfs_flush_incompatible(struct file *file, struct page *page)
{
- struct nfs_open_context *ctx = (struct nfs_open_context *)file->private_data;
+ struct nfs_open_context *ctx;
struct inode *inode = page->mapping->host;
struct nfs_page *req;
int status = 0;
+
+ if (nfs_is_valid_file(file))
+ ctx = (struct nfs_open_context *)file->private_data;
+ else
+ ctx = NULL;
+
/*
* Look for a request corresponding to this page. If there
* is one, and it belongs to another file, we flush it out
@@ -733,7 +739,7 @@ int nfs_flush_incompatible(struct file *
int nfs_updatepage(struct file *file, struct page *page,
unsigned int offset, unsigned int count)
{
- struct nfs_open_context *ctx = (struct nfs_open_context *)file->private_data;
+ struct nfs_open_context *ctx;
struct inode *inode = page->mapping->host;
struct nfs_page *req;
int status = 0;
@@ -743,14 +749,23 @@ int nfs_updatepage(struct file *file, st
file->f_dentry->d_name.name, count,
(long long)(page_offset(page) +offset));
+ if (nfs_is_valid_file(file))
+ ctx = get_nfs_open_context((struct nfs_open_context *)
+ file->private_data);
+ else {
+ ctx = nfs_find_open_context(inode, NULL, FMODE_WRITE);
+ if (!ctx)
+ return -EBADF;
+ }
+
if (IS_SYNC(inode)) {
status = nfs_writepage_sync(ctx, inode, page, offset, count, 0);
if (status > 0) {
if (offset == 0 && status == PAGE_CACHE_SIZE)
SetPageUptodate(page);
- return 0;
+ status = 0;
}
- return status;
+ goto out;
}
/* If we're not using byte range locks, and we know the page
@@ -803,6 +818,8 @@ done:
status, (long long)i_size_read(inode));
if (status < 0)
ClearPageUptodate(page);
+out:
+ put_nfs_open_context(ctx);
return status;
}
diff -Nurp linux-2.6.14-git/include/linux/nfs_fs.h linux/include/linux/nfs_fs.h
--- linux-2.6.14-git/include/linux/nfs_fs.h 2005-11-07 07:53:50.000000000 -0600
+++ linux/include/linux/nfs_fs.h 2005-11-09 11:44:53.000000000 -0600
@@ -350,6 +350,20 @@ static inline struct rpc_cred *nfs_file_
}
/*
+ * A stackable file system may have its own file & inode structures, which
+ * point to the local inode's mapping. The address space operations cannot
+ * use the stackable file system's file structure to get to the open context.
+ */
+static inline int nfs_is_valid_file(struct file *file)
+{
+ struct inode *inode;
+ if (!file)
+ return 0;
+ inode = file->f_dentry->d_inode;
+ return (inode == inode->i_mapping->host);
+}
+
+/*
* linux/fs/nfs/xattr.c
*/
#ifdef CONFIG_NFS_V3_ACL
--
David Kleikamp
IBM Linux Technology Center
* Re: [RFC] Support for stackable file systems on top of nfs
2005-11-10 17:32 Dave Kleikamp
@ 2005-11-10 20:07 ` Christoph Hellwig
2005-11-10 21:35 ` John T. Kohl
2005-11-10 21:24 ` Trond Myklebust
1 sibling, 1 reply; 36+ messages in thread
From: Christoph Hellwig @ 2005-11-10 20:07 UTC (permalink / raw)
To: Dave Kleikamp; +Cc: nfsv4, fsdevel
On Thu, Nov 10, 2005 at 11:32:22AM -0600, Dave Kleikamp wrote:
> The following patch allows stackable file systems, such as ClearCase's
> mvfs, to run atop nfs. mvfs has it's own file and inode structures, but
> points its inode->i_mapping to the lower file system's mapping. This
> causes problems when nfs's address space operations try to extract the
> open context from file->private_data.
>
> The patch adds a small overhead of checking the file structure to see if
> it contains an inode that is not the mapping's host.
>
> I am curious if there are any other stackable file systems that could
> benefit from this.
A stackable filesystem must never call underlying methods with its
own file structures. Whatever filesystem you use (I suspect the broken
piece of clearcase shit that's always causing trouble) needs to be fixed
instead.
* Re: [RFC] Support for stackable file systems on top of nfs
2005-11-10 17:32 Dave Kleikamp
2005-11-10 20:07 ` Christoph Hellwig
@ 2005-11-10 21:24 ` Trond Myklebust
2005-11-10 21:36 ` Shaya Potter
1 sibling, 1 reply; 36+ messages in thread
From: Trond Myklebust @ 2005-11-10 21:24 UTC (permalink / raw)
To: Dave Kleikamp; +Cc: nfsv4, fsdevel
On Thu, 2005-11-10 at 11:32 -0600, Dave Kleikamp wrote:
> The following patch allows stackable file systems, such as ClearCase's
> mvfs, to run atop nfs. mvfs has it's own file and inode structures, but
> points its inode->i_mapping to the lower file system's mapping. This
> causes problems when nfs's address space operations try to extract the
> open context from file->private_data.
>
> The patch adds a small overhead of checking the file structure to see if
> it contains an inode that is not the mapping's host.
NACK
This is a fundamentally flawed approach. nfs_find_open_context() is
designed for the mmap() case, where you have a valid vm_area_struct that
has a "struct file" with a valid NFS open context attached to it.
Existence of the file is guaranteed, but the readpage() and writepage()
interfaces don't actually pass the struct file down to the filesystem.
This is clearly not the case here.
If you want to make mvfs work correctly with NFS, then have it set up a
valid NFS struct file, and use that file with the NFS functions.
Anything else is borken.
Cheers,
Trond
* Re: [RFC] Support for stackable file systems on top of nfs
2005-11-10 20:07 ` Christoph Hellwig
@ 2005-11-10 21:35 ` John T. Kohl
2005-11-10 21:40 ` Shaya Potter
` (3 more replies)
0 siblings, 4 replies; 36+ messages in thread
From: John T. Kohl @ 2005-11-10 21:35 UTC (permalink / raw)
To: nfsv4, fsdevel
> On Thu, Nov 10, 2005 at 11:32:22AM -0600, Dave Kleikamp wrote:
>> The following patch allows stackable file systems, such as ClearCase's
>> mvfs, to run atop nfs. mvfs has it's own file and inode structures, but
>> points its inode->i_mapping to the lower file system's mapping. This
>> causes problems when nfs's address space operations try to extract the
>> open context from file->private_data.
>>
>> The patch adds a small overhead of checking the file structure to see if
>> it contains an inode that is not the mapping's host.
>>
>> I am curious if there are any other stackable file systems that could
>> benefit from this.
Let me explain a bit more what's going on here. MVFS would like to do
the same thing that CODA does. In the file->mmap() operation, CODA and
MVFS want to set up paging operations to be handled by the backing store
inode. See for example fs/coda/file.c:coda_file_mmap(), which sets
coda_inode->i_mapping = host_inode->i_mapping.
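In rough outline the approach looks like this (a simplified sketch, not the
verbatim fs/coda/file.c code; get_backing_file() is a made-up stand-in for
however the stacking layer finds its host file, and the usual linux/fs.h
declarations are assumed):

static int coda_like_mmap(struct file *coda_file, struct vm_area_struct *vma)
{
        struct file *host_file = get_backing_file(coda_file);
        struct inode *coda_inode = coda_file->f_dentry->d_inode;
        struct inode *host_inode = host_file->f_dentry->d_inode;

        if (!host_file->f_op || !host_file->f_op->mmap)
                return -ENODEV;

        /* share the page cache with the backing store */
        coda_inode->i_mapping = host_inode->i_mapping;

        /* let the host file system set up the mapping itself */
        return host_file->f_op->mmap(host_file, vma);
}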
But this fails when host_inode is an NFS inode. NFS assumes
that when it gets paging operations, it can look at the file pointer
passed to the address_space_operations' readpage function, and that file
pointer will be for an open NFS file. If NFS is a backing store inode,
the file pointer is for the stacked file system's open file.
CODA certainly won't work today with NFS host inodes and mapped files.
I'm not surprised nobody noticed, since that seems like a poor way to
use CODA. Using NFS backing store is a primary use case for ClearCase
MVFS, so we noticed.
--
John Kohl
Senior Software Engineer
Rational Software
IBM Software Group
Lexington, Massachusetts, USA
jtk@us.ibm.com
The opinions expressed in this message do not reflect the views of my
employer.
* Re: [RFC] Support for stackable file systems on top of nfs
2005-11-10 21:24 ` Trond Myklebust
@ 2005-11-10 21:36 ` Shaya Potter
2005-11-10 22:18 ` Trond Myklebust
0 siblings, 1 reply; 36+ messages in thread
From: Shaya Potter @ 2005-11-10 21:36 UTC (permalink / raw)
To: Trond Myklebust; +Cc: Dave Kleikamp, nfsv4, fsdevel
On Thu, 2005-11-10 at 16:24 -0500, Trond Myklebust wrote:
> On Thu, 2005-11-10 at 11:32 -0600, Dave Kleikamp wrote:
> > The following patch allows stackable file systems, such as ClearCase's
> > mvfs, to run atop nfs. mvfs has it's own file and inode structures, but
> > points its inode->i_mapping to the lower file system's mapping. This
> > causes problems when nfs's address space operations try to extract the
> > open context from file->private_data.
> >
> > The patch adds a small overhead of checking the file structure to see if
> > it contains an inode that is not the mapping's host.
>
> NACK
>
> This is a fundamentally flawed approach. The nfs_find_open_context() is
> designed for the mmap() case were you have a valid vm_area_struct, that
> has a "struct file" with a valid NFS open context attached to it.
> Existence of the file is guaranteed, but the readpage() and writepage()
> interfaces don't actually pass the struct file down to the filesystem.
>
> This is clearly not the case here.
>
> If you want to make mvfs work correctly with NFS, then have it set up a
> valid NFS struct file, and use that file with the NFS functions.
> Anything else is borken.
From my experience with stackable file systems, I'm not sure the above
is totally correct; i.e. vm_area_structs are only around while the process
that created them is. Once a process exits, the file and vm_area_struct go
away.
Now imagine you do a shared mmap() write, don't sync, and just exit the
program. Some time later the kernel will kick in, call writepage() and
write the page to disk, but your above "guarantees" are no longer around.
* Re: [RFC] Support for stackable file systems on top of nfs
2005-11-10 21:35 ` John T. Kohl
@ 2005-11-10 21:40 ` Shaya Potter
2005-11-10 21:57 ` John T. Kohl
2005-11-10 21:50 ` Christoph Hellwig
` (2 subsequent siblings)
3 siblings, 1 reply; 36+ messages in thread
From: Shaya Potter @ 2005-11-10 21:40 UTC (permalink / raw)
To: John T. Kohl; +Cc: nfsv4, fsdevel
On Thu, 2005-11-10 at 16:35 -0500, John T. Kohl wrote:
> > On Thu, Nov 10, 2005 at 11:32:22AM -0600, Dave Kleikamp wrote:
> >> The following patch allows stackable file systems, such as ClearCase's
> >> mvfs, to run atop nfs. mvfs has it's own file and inode structures, but
> >> points its inode->i_mapping to the lower file system's mapping. This
> >> causes problems when nfs's address space operations try to extract the
> >> open context from file->private_data.
> >>
> >> The patch adds a small overhead of checking the file structure to see if
> >> it contains an inode that is not the mapping's host.
> >>
> >> I am curious if there are any other stackable file systems that could
> >> benefit from this.
>
> Let me explain a bit more what's going on here. MVFS would like to do
> the same thing that CODA does. In the file->mmap() operation, CODA and
> MVFS want to set up paging operations to be handled by the backing store
> inode. See for example fs/coda/file.c:coda_file_mmap(), it sets
> coda_inode->i_mapping = host_inode->i_mapping.
>
> But this fails when host_inode is an NFS inode. NFS assumes
> that when it gets paging operations, it can look at the file pointer
> passed to the address_space_operations' readpage function, and that file
> pointer will be for an open NFS file. If NFS is a backing store inode,
> the file pointer is for the stacked file system's open file.
>
> CODA certainly won't work today with NFS host inodes and mapped files.
> I'm not surprised nobody noticed, since that seems like a poor way to
> use CODA. Using NFS backing store is a primary use case for ClearCase
> MVFS, so we noticed.
I think you'd notice it on other file systems as well. For instance, my
experience is that GFS doesn't play nice w/ stackable file systems that
try to stack on the a_ops, though it's ok if the stackable file system just
passes all page cache operations directly down to the lower file system.
OCFS2, on the other hand, seems to play better w/ stacking on the a_ops.
* Re: [RFC] Support for stackable file systems on top of nfs
2005-11-10 21:35 ` John T. Kohl
2005-11-10 21:40 ` Shaya Potter
@ 2005-11-10 21:50 ` Christoph Hellwig
2005-11-11 2:31 ` Trond Myklebust
2005-11-14 15:56 ` David Howells
3 siblings, 0 replies; 36+ messages in thread
From: Christoph Hellwig @ 2005-11-10 21:50 UTC (permalink / raw)
To: John T. Kohl; +Cc: fsdevel, nfsv4
On Thu, Nov 10, 2005 at 04:35:47PM -0500, John T. Kohl wrote:
> Let me explain a bit more what's going on here. MVFS would like to do
> the same thing that CODA does. In the file->mmap() operation, CODA and
> MVFS want to set up paging operations to be handled by the backing store
> inode. See for example fs/coda/file.c:coda_file_mmap(), it sets
> coda_inode->i_mapping = host_inode->i_mapping.
>
> But this fails when host_inode is an NFS inode. NFS assumes
> that when it gets paging operations, it can look at the file pointer
> passed to the address_space_operations' readpage function, and that file
> pointer will be for an open NFS file. If NFS is a backing store inode,
> the file pointer is for the stacked file system's open file.
>
> CODA certainly won't work today with NFS host inodes and mapped files.
> I'm not surprised nobody noticed, since that seems like a poor way to
> use CODA. Using NFS backing store is a primary use case for ClearCase
> MVFS, so we noticed.
so coda is broken as well, news at 11 ;-) when a file operation takes
a struct file, directly or indirectly, you absolutely must pass down
a struct file of that filesystem. so wrap your address operations and
pass down the proper nfs file struct.
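roughly something like this (just a sketch with made-up names; assume the
stacking fs opened a lower nfs file with dentry_open() at open time and
stashed it in its private data):

static int stackfs_readpage(struct file *upper_file, struct page *page)
{
        /* the struct file handed to us belongs to the stacking fs ... */
        struct stackfs_file_info *info = upper_file->private_data;
        /* ... so dig out the nfs struct file we opened ourselves ... */
        struct file *lower_file = info->lower_file;

        /* ... and hand *that* to the lower address space operation */
        return lower_file->f_mapping->a_ops->readpage(lower_file, page);
}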
* Re: [RFC] Support for stackable file systems on top of nfs
2005-11-10 21:40 ` Shaya Potter
@ 2005-11-10 21:57 ` John T. Kohl
0 siblings, 0 replies; 36+ messages in thread
From: John T. Kohl @ 2005-11-10 21:57 UTC (permalink / raw)
To: nfsv4, fsdevel
>>>>> "Shaya" == Shaya Potter <spotter@cs.columbia.edu> writes:
>>>>>> > On Thu, Nov 10, 2005 at 11:32:22AM -0600, Dave Kleikamp wrote:
Dave>> The following patch allows stackable file systems, such as ClearCase's
Dave>> mvfs, to run atop nfs. mvfs has it's own file and inode structures, but
Dave>> points its inode->i_mapping to the lower file system's mapping. This
Dave>> causes problems when nfs's address space operations try to extract the
Dave>> open context from file->private_data.
Dave>>
Dave>> The patch adds a small overhead of checking the file structure to see if
Dave>> it contains an inode that is not the mapping's host.
Dave>>
Dave>> I am curious if there are any other stackable file systems that could
Dave>> benefit from this.
John>> Let me explain a bit more what's going on here. MVFS would like to do
John>> the same thing that CODA does. In the file->mmap() operation, CODA and
John>> MVFS want to set up paging operations to be handled by the backing store
John>> inode. See for example fs/coda/file.c:coda_file_mmap(), it sets
John>> coda_inode-> i_mapping = host_inode->i_mapping.
John>>
John>> But this fails when host_inode is an NFS inode. NFS assumes
John>> that when it gets paging operations, it can look at the file pointer
John>> passed to the address_space_operations' readpage function, and that file
John>> pointer will be for an open NFS file. If NFS is a backing store inode,
John>> the file pointer is for the stacked file system's open file.
John>>
John>> CODA certainly won't work today with NFS host inodes and mapped files.
John>> I'm not surprised nobody noticed, since that seems like a poor way to
John>> use CODA. Using NFS backing store is a primary use case for ClearCase
John>> MVFS, so we noticed.
Shaya> I think you'd notice it on other file systems as well. For instance, my
Shaya> experience is that GFS doesn't play nice w/ stackable file systems that
Shaya> try to stack on the a_ops. On the other hand, it's ok if it just passes
Shaya> all page cache operations directly down to the lower file system.
Shaya> OCFS2, on the other hand, seems to play better w/ stacking on the a_ops.
Hmm, so thinking about this more, I think the minimal patch would only
change how the file structure is found for the address space operations,
and leave the other ones alone. (And FYI, yes MVFS does have a (struct
file *) open on the backing store inode, and uses it for
non-mapping-related calls.)
--
John Kohl
Senior Software Engineer - Rational Software - IBM Software Group
Lexington, Massachusetts, USA
jtk@us.ibm.com
<http://www.ibm.com/software/rational/>
* Re: [RFC] Support for stackable file systems on top of nfs
2005-11-10 21:36 ` Shaya Potter
@ 2005-11-10 22:18 ` Trond Myklebust
2005-11-10 22:27 ` Shaya Potter
0 siblings, 1 reply; 36+ messages in thread
From: Trond Myklebust @ 2005-11-10 22:18 UTC (permalink / raw)
To: Shaya Potter; +Cc: Dave Kleikamp, nfsv4, fsdevel
On Thu, 2005-11-10 at 16:36 -0500, Shaya Potter wrote:
> On Thu, 2005-11-10 at 16:24 -0500, Trond Myklebust wrote:
> > On Thu, 2005-11-10 at 11:32 -0600, Dave Kleikamp wrote:
> > > The following patch allows stackable file systems, such as ClearCase's
> > > mvfs, to run atop nfs. mvfs has it's own file and inode structures, but
> > > points its inode->i_mapping to the lower file system's mapping. This
> > > causes problems when nfs's address space operations try to extract the
> > > open context from file->private_data.
> > >
> > > The patch adds a small overhead of checking the file structure to see if
> > > it contains an inode that is not the mapping's host.
> >
> > NACK
> >
> > This is a fundamentally flawed approach. The nfs_find_open_context() is
> > designed for the mmap() case were you have a valid vm_area_struct, that
> > has a "struct file" with a valid NFS open context attached to it.
> > Existence of the file is guaranteed, but the readpage() and writepage()
> > interfaces don't actually pass the struct file down to the filesystem.
> >
> > This is clearly not the case here.
> >
> > If you want to make mvfs work correctly with NFS, then have it set up a
> > valid NFS struct file, and use that file with the NFS functions.
> > Anything else is borken.
>
> from my experiences with stackable file systems, I'm not sure the above
> is totally correct. i.e. vm_area_structs only around while a process is
> in use. Once a process exits, the file and vm_area_struct go away.
...calling file->f_ops->release() in the process. Guess what happens
there?
Cheers,
Trond
* Re: [RFC] Support for stackable file systems on top of nfs
2005-11-10 22:18 ` Trond Myklebust
@ 2005-11-10 22:27 ` Shaya Potter
2005-11-10 22:40 ` Trond Myklebust
0 siblings, 1 reply; 36+ messages in thread
From: Shaya Potter @ 2005-11-10 22:27 UTC (permalink / raw)
To: Trond Myklebust; +Cc: fsdevel, nfsv4, Dave Kleikamp
On Thu, 2005-11-10 at 17:18 -0500, Trond Myklebust wrote:
> On Thu, 2005-11-10 at 16:36 -0500, Shaya Potter wrote:
> > On Thu, 2005-11-10 at 16:24 -0500, Trond Myklebust wrote:
> > > On Thu, 2005-11-10 at 11:32 -0600, Dave Kleikamp wrote:
> > > > The following patch allows stackable file systems, such as ClearCase's
> > > > mvfs, to run atop nfs. mvfs has it's own file and inode structures, but
> > > > points its inode->i_mapping to the lower file system's mapping. This
> > > > causes problems when nfs's address space operations try to extract the
> > > > open context from file->private_data.
> > > >
> > > > The patch adds a small overhead of checking the file structure to see if
> > > > it contains an inode that is not the mapping's host.
> > >
> > > NACK
> > >
> > > This is a fundamentally flawed approach. The nfs_find_open_context() is
> > > designed for the mmap() case were you have a valid vm_area_struct, that
> > > has a "struct file" with a valid NFS open context attached to it.
> > > Existence of the file is guaranteed, but the readpage() and writepage()
> > > interfaces don't actually pass the struct file down to the filesystem.
> > >
> > > This is clearly not the case here.
> > >
> > > If you want to make mvfs work correctly with NFS, then have it set up a
> > > valid NFS struct file, and use that file with the NFS functions.
> > > Anything else is borken.
> >
> > from my experiences with stackable file systems, I'm not sure the above
> > is totally correct. i.e. vm_area_structs only around while a process is
> > in use. Once a process exits, the file and vm_area_struct go away.
>
> ...calling file->f_ops->release() in the process. Guess what happens
> there?
From my experience, writepage is called significantly after a process is
gone. I've seen this by instrumenting writepage() in stackable file systems,
using a simple program that maps a file, modifies it, and then exits. The
kernel hit the writepage() function 30 seconds or so after the process
exited.
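A sketch of that kind of test program (made-up path, error handling omitted):

#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/mnt/stacked/testfile", O_RDWR);
        char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

        memcpy(p, "dirty this page", 16);  /* dirty the shared mapping */
        close(fd);                         /* no msync(), no munmap() */
        return 0;                          /* exit; writepage() comes much later */
}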
ext2_release_file just does an ext2_discard_prealloc, which has nothing
to do with written data.
Looking at nfs, nfs_file_release does write out the data by calling
filemap_fdatawrite(), but it seems somewhat unique in that regard. Then
again, I guess we are talking about NFS here. :)
* Re: [RFC] Support for stackable file systems on top of nfs
2005-11-10 22:27 ` Shaya Potter
@ 2005-11-10 22:40 ` Trond Myklebust
2005-11-11 0:12 ` Bryan Henderson
0 siblings, 1 reply; 36+ messages in thread
From: Trond Myklebust @ 2005-11-10 22:40 UTC (permalink / raw)
To: Shaya Potter; +Cc: Dave Kleikamp, nfsv4, fsdevel
On Thu, 2005-11-10 at 17:27 -0500, Shaya Potter wrote:
> In looking at nfs, nfs_file_release does write out the data by calling
> filemap_fdatawrite(), but it seems somewhat unique in that regard, but I
> guess we are talking about NFS here. :)
It should hardly come as a newsflash that remote filesystems are
inherently different to local filesystems.
If the user disappears without leaving a credential for us to use then
writing out those pages would be a real PITA. ...and credentials are
cached in the struct file.
That is why we flush the data when we do.
Cheers,
Trond
* Re: [RFC] Support for stackable file systems on top of nfs
2005-11-10 22:40 ` Trond Myklebust
@ 2005-11-11 0:12 ` Bryan Henderson
2005-11-11 1:30 ` Brad Boyer
` (2 more replies)
0 siblings, 3 replies; 36+ messages in thread
From: Bryan Henderson @ 2005-11-11 0:12 UTC (permalink / raw)
To: Trond Myklebust; +Cc: fsdevel, nfsv4, Dave Kleikamp, Shaya Potter
>It should hardly come as a newsflash that remote filesystems are
>inherently different to local filesystems.
You'd have to give a pretty specific definition of remote filesystem
before I'd agree with that. At its most basic level, remote just means
distant, and the matter of needing a credential to access a file has more
to do with the fact that the filesystem is shared than that it is distant.
>If the user disappears without leaving a credential for us to use then
>writing out those pages would be a real PITA. ...and credentials are
>cached in the struct file.
>
>That is why we flush the data when we do.
That's a workaround, right? A workaround of the fact that the Linux
buffer cache doesn't have a way to buffer credentials along with buffered
write data?
I'm a little fuzzy on how that works anyway, since there's usually a
shared cache -- the same file data cache page can be dirtied via multiple
mmaps, struct files, and users.
I've always been irritated by the fact that filesystem drivers see struct
file at all. struct file ought to live in a higher
filesystem-type-independent layer, with the filesystem driver seeing just
inodes. Why are credentials cached in the struct file? Is that a natural
place for it or just what's available?
--
Bryan Henderson IBM Almaden Research Center
San Jose CA Filesystems
* Re: [RFC] Support for stackable file systems on top of nfs
2005-11-11 0:12 ` Bryan Henderson
@ 2005-11-11 1:30 ` Brad Boyer
2005-11-11 2:06 ` Trond Myklebust
2005-11-11 16:40 ` Nikita Danilov
2 siblings, 0 replies; 36+ messages in thread
From: Brad Boyer @ 2005-11-11 1:30 UTC (permalink / raw)
To: Bryan Henderson
Cc: Trond Myklebust, fsdevel, nfsv4, Dave Kleikamp, Shaya Potter
On Thu, Nov 10, 2005 at 04:12:25PM -0800, Bryan Henderson wrote:
> Why are credentials cached in the struct file? Is that a natural
> place for it or just what's available?
I don't have intimate knowledge of the details, but I suspect it's
because credentials more naturally map onto a file than an inode.
For example, if you have two users opening the exact same path from
an NFS mount, there is just one struct inode, but there are at least
two different struct file (one for each open call). The credentials
are tied to the user session, not to the path. Because of this, you
may have multiple unrelated credentials associated with what is
the same struct inode under the covers.
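A trivial user-space illustration (made-up path, error handling omitted):

#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
        /* two independent opens: two struct files (just as two different
         * users each doing their own open would get), two private_data */
        int fd1 = open("/mnt/nfs/shared/data", O_RDONLY);
        int fd2 = open("/mnt/nfs/shared/data", O_RDONLY);
        struct stat st1, st2;

        fstat(fd1, &st1);
        fstat(fd2, &st2);
        /* ... but only one underlying inode */
        printf("same inode: %s\n", st1.st_ino == st2.st_ino ? "yes" : "no");
        close(fd1);
        close(fd2);
        return 0;
}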
Brad Boyer
flar@allandria.com
* Re: [RFC] Support for stackable file systems on top of nfs
2005-11-11 0:12 ` Bryan Henderson
2005-11-11 1:30 ` Brad Boyer
@ 2005-11-11 2:06 ` Trond Myklebust
2005-11-11 18:18 ` Bryan Henderson
2005-11-11 16:40 ` Nikita Danilov
2 siblings, 1 reply; 36+ messages in thread
From: Trond Myklebust @ 2005-11-11 2:06 UTC (permalink / raw)
To: Bryan Henderson; +Cc: fsdevel, nfsv4, Dave Kleikamp, Shaya Potter
On Thu, 2005-11-10 at 16:12 -0800, Bryan Henderson wrote:
> >It should hardly come as a newsflash that remote filesystems are
> >inherently different to local filesystems.
>
> You'd have to give a pretty specific definition of remote filesystem
> before I'd agree with that. At its most basic level, remote just means
> distant, and the matter of needing a credential to access a file has more
> to do with the fact that the filesystem is shared than that it is distant.
Show me a remote filesystem that doesn't have some form of
authentication.
> >If the user disappears without leaving a credential for us to use then
> >writing out those pages would be a real PITA. ...and credentials are
> >cached in the struct file.
> >
> >That is why we flush the data when we do.
>
> That's a workaround, right? A workaround of the fact that the Linux
> buffer cache doesn't have a way to buffer credentials along with buffered
> write data?
No. It is an inherent feature of shared mmapped files that the pages can
be written to by different users. When the VM finally gets round to
flushing them out, all it knows is that this page is dirty.
This is pretty much a generic problem on all mmap implementations. It
would be extremely difficult to tag each and every access to a page with
a credential (memcpy would end up being a very slow operation).
> I'm a little fuzzy on how that works anyway, since there's usually a
> shared cache -- the same file data cache page can be dirtied via multiple
> mmaps, struct files, and users.
Right.
> I've always been irritated by the fact that filesystem drivers see struct
> file at all. struct file ought to live in a higher
> filesystem-type-independent layer, with the filesystem driver seeing just
> inodes.
Whether it irritates you or not, that is NEVER going to happen.
Generic filesystems can, and usually do, have private open state that
they need to carry around. Even local filesystems. For an example, just
look at something like the ext3 readdir implementation.
You may also have noticed that MVFS claimed to have the same
requirement...
> Why are credentials cached in the struct file? Is that a natural
> place for it or just what's available?
POSIX mandates that you check for access on open(). If your task later
goes suid, then that is not allowed to have any consequences for
existing open files.
For instance, we used to have a problem in NFS with things like the
command "mount -av > file.out" because on starting up, mount would suid,
then try to write to the already open file using root credentials
instead of the user credentials that bash had opened file.out with.
These credentials also have to follow functions like dup(), fork(),
etc....
Now that we have NFSv4, we also have to track things like the stateids
that represent the current locking state on the server. If your process
happens to have the file open, and has set a mandatory lock, it would be
very annoying if your WRITE was refused because you had lost the stateid
that tells the server that your process is the one that holds the
mandatory lock.
So yes... All this fits naturally in the struct file, which is the
structure that the kernel uses to track your POSIX file
descriptor-specific state.
Cheers,
Trond
* Re: [RFC] Support for stackable file systems on top of nfs
2005-11-10 21:35 ` John T. Kohl
2005-11-10 21:40 ` Shaya Potter
2005-11-10 21:50 ` Christoph Hellwig
@ 2005-11-11 2:31 ` Trond Myklebust
2005-11-11 4:04 ` Trond Myklebust
2005-11-14 15:56 ` David Howells
3 siblings, 1 reply; 36+ messages in thread
From: Trond Myklebust @ 2005-11-11 2:31 UTC (permalink / raw)
To: John T. Kohl, dhowells; +Cc: nfsv4, fsdevel
On Thu, 2005-11-10 at 16:35 -0500, John T. Kohl wrote:
> Let me explain a bit more what's going on here. MVFS would like to do
> the same thing that CODA does. In the file->mmap() operation, CODA and
> MVFS want to set up paging operations to be handled by the backing store
> inode. See for example fs/coda/file.c:coda_file_mmap(), it sets
> coda_inode->i_mapping = host_inode->i_mapping.
>
> But this fails when host_inode is an NFS inode. NFS assumes
> that when it gets paging operations, it can look at the file pointer
> passed to the address_space_operations' readpage function, and that file
> pointer will be for an open NFS file. If NFS is a backing store inode,
> the file pointer is for the stacked file system's open file.
This will not change. We have per-file descriptor state we need to track
in order to work correctly.
> CODA certainly won't work today with NFS host inodes and mapped files.
> I'm not surprised nobody noticed, since that seems like a poor way to
> use CODA. Using NFS backing store is a primary use case for ClearCase
> MVFS, so we noticed.
It sounds to me like you want to talk to the cachefs folks. They too
need special hooks in the NFS low-level page cache routines in order to
be able to mirror write requests to the local backing store and/or
reroute read requests to that backing store.
David?
Cheers,
Trond
* Re: [RFC] Support for stackable file systems on top of nfs
2005-11-11 2:31 ` Trond Myklebust
@ 2005-11-11 4:04 ` Trond Myklebust
2005-11-11 13:45 ` John T. Kohl
0 siblings, 1 reply; 36+ messages in thread
From: Trond Myklebust @ 2005-11-11 4:04 UTC (permalink / raw)
To: John T. Kohl; +Cc: dhowells, fsdevel, nfsv4
On Thu, 2005-11-10 at 21:32 -0500, Trond Myklebust wrote:
> It sounds to me like you want to talk to the cachefs folks. They too
> need special hooks in the NFS low-level page cache routines in order to
> be able to mirror write requests to the local backing store and/or
> reroute read requests to that backing store.
Note: I'm not saying that you should special case Clearcase for NFS, but
if both you and cachefs have similar requirements for hooks, then
perhaps we could look for a common solution (perhaps at the VFS level?).
Cheers,
Trond
* Re: [RFC] Support for stackable file systems on top of nfs
2005-11-11 4:04 ` Trond Myklebust
@ 2005-11-11 13:45 ` John T. Kohl
2005-11-11 15:27 ` Charles P. Wright
0 siblings, 1 reply; 36+ messages in thread
From: John T. Kohl @ 2005-11-11 13:45 UTC (permalink / raw)
To: Trond Myklebust, dhowells, nfsv4, fsdevel
>>>>> "Trond" == Trond Myklebust <trondmy@trondhjem.org> writes:
Trond> On Thu, 2005-11-10 at 21:32 -0500, Trond Myklebust wrote:
>> It sounds to me like you want to talk to the cachefs folks. They too
>> need special hooks in the NFS low-level page cache routines in order to
>> be able to mirror write requests to the local backing store and/or
>> reroute read requests to that backing store.
Trond> Note: I'm not saying that you should special case Clearcase for NFS, but
Trond> if both you and cachefs have similar requirements for hooks, then
Trond> perhaps we could look for a common solution (perhaps at the VFS level?).
Thanks for the encouragement.
It looks to me like the i_mapping and f_mapping stuff is intended to let
a stacking file system share pages with a backing-store file system (we
really want to share pages, it's efficient and avoids a whole host of
cache coherency problems), but the interfaces are not adequate for that
to work with NFS as the backing-store.
Other than i_mapping/f_mapping, I don't think it's possible right now
for stacking file systems to handle the address_space operations in our
layer *and* share the same pages with the backing-store, since the struct
pages are attached to the address space via file->f_mapping.
So yeah, if NFS or other file systems need to have a file pointer for
its paging operations, sounds like we need some changes in the VM/file
system interfaces to provide page sharing for stacking file systems.
[Special-casing for NFS would be tricky and probably improper--should we
really care what's below us? How would we determine that our backing
store inode is an NFS inode (or any other sort that doesn't handle
i_mapping hosting)? We don't have access to the NFS symbol names for
the file_operations or address_space_operations, so we can't even cheat
and determine whether the object below us is NFS.
So in essence the i_mapping/f_mapping stuff is not really fully usable,
unless the stacking file system "knows" that the backing store is safe.]
--
John Kohl
Senior Software Engineer - Rational Software - IBM Software Group
Lexington, Massachusetts, USA
jtk@us.ibm.com
<http://www.ibm.com/software/rational/>
* Re: [RFC] Support for stackable file systems on top of nfs
2005-11-11 13:45 ` John T. Kohl
@ 2005-11-11 15:27 ` Charles P. Wright
2005-11-11 17:38 ` John T. Kohl
0 siblings, 1 reply; 36+ messages in thread
From: Charles P. Wright @ 2005-11-11 15:27 UTC (permalink / raw)
To: John T. Kohl; +Cc: Trond Myklebust, dhowells, nfsv4, fsdevel
On Fri, 2005-11-11 at 08:45 -0500, John T. Kohl wrote:
> >>>>> "Trond" == Trond Myklebust <trondmy@trondhjem.org> writes:
>
> Trond> On Thu, 2005-11-10 at 21:32 -0500, Trond Myklebust wrote:
> >> It sounds to me like you want to talk to the cachefs folks. They too
> >> need special hooks in the NFS low-level page cache routines in order to
> >> be able to mirror write requests to the local backing store and/or
> >> reroute read requests to that backing store.
>
> Trond> Note: I'm not saying that you should special case Clearcase for NFS, but
> Trond> if both you and cachefs have similar requirements for hooks, then
> Trond> perhaps we could look for a common solution (perhaps at the VFS level?).
>
> Thanks for the encouragement.
>
> It looks to me like the i_mapping and f_mapping stuff is intended to let
> a stacking file system share pages with a backing-store file system (we
> really want to share pages, it's efficient and avoids a whole host of
> cache coherency problems), but the interfaces are not adequate for that
> to work with NFS as the backing-store.
>
> Other than i_mapping/f_mapping, I don't think it's possible right now
> for stacking file systems to handle the address_space operations in our
> layer *and* share the same pages with the backing-store, since the struct
> pages are attached to the address space via file->f_mapping.
At Stony Brook, we've come across similar problems. It is relatively
easy to double cache, but inefficient. It is also relatively easy to
single-cache, but then you don't get to intercept any of these
interesting operations. Getting both at once is tricky. Nikolai Joukov
developed a method that he uses for Tracefs, with pointer flipping.
Basically, we set the page mapping to the lower-level mapping before the
operation, and unset it afterwards.
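Very roughly (a sketch of the idea, not the actual Tracefs code; the names
are made up, and the real thing has to worry about locking and asynchronous
completion):

static int flipfs_readpage(struct file *lower_file, struct page *page)
{
        struct address_space *upper_mapping = page->mapping;
        int err;

        page->mapping = lower_file->f_mapping;  /* flip to the lower mapping */
        err = page->mapping->a_ops->readpage(lower_file, page);
        page->mapping = upper_mapping;          /* flip back */
        return err;
}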
> [Special-casing for NFS would be tricky and probably improper--should we
> really care what's below us? How would we determine that our backing
> store inode is an NFS inode (or any other sort that doesn't handle
> i_mapping hosting)? We don't have access to the NFS symbol names for
> the file_operations or address_space_operations, so we can't even cheat
> and determine whether the object below us is NFS.
One way to check is !strcmp(i_mapping->host->i_sb->s_type->name, "nfs").
We used this in Unionfs because NFS returns EACCES instead of
EROFS for read-only file systems.
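Wrapped up as a helper it is just (a sketch; the actual Unionfs code may
differ):

static inline int is_nfs_mapping(struct address_space *mapping)
{
        return !strcmp(mapping->host->i_sb->s_type->name, "nfs");
}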
Charles
* Re: [RFC] Support for stackable file systems on top of nfs
2005-11-11 0:12 ` Bryan Henderson
2005-11-11 1:30 ` Brad Boyer
2005-11-11 2:06 ` Trond Myklebust
@ 2005-11-11 16:40 ` Nikita Danilov
2005-11-11 18:45 ` Bryan Henderson
2 siblings, 1 reply; 36+ messages in thread
From: Nikita Danilov @ 2005-11-11 16:40 UTC (permalink / raw)
To: Bryan Henderson; +Cc: fsdevel, Shaya Potter, nfsv4, Dave Kleikamp
Bryan Henderson writes:
[...]
>
> I've always been irritated by the fact that filesystem drivers see struct
> file at all. struct file ought to live in a higher
> filesystem-type-independent layer, with the filesystem driver seeing just
> inodes.
This is how *BSD VFS works, and it is a pain in the neck. For example,
implementing a file-system-type-specific read-ahead algorithm requires
jumping through all kinds of hoops, because such an algorithm works with
data (read-ahead window parameters) that naturally tends to be stored in
the file descriptor, and the latter is invisible to the file-system
code.
> Why are credentials cached in the struct file? Is that a natural
> place for it or just what's available?
>
> --
> Bryan Henderson IBM Almaden Research Center
> San Jose CA Filesystems
Nikita.
* Re: [RFC] Support for stackable file systems on top of nfs
2005-11-11 15:27 ` Charles P. Wright
@ 2005-11-11 17:38 ` John T. Kohl
0 siblings, 0 replies; 36+ messages in thread
From: John T. Kohl @ 2005-11-11 17:38 UTC (permalink / raw)
To: Charles P. Wright; +Cc: Trond Myklebust, dhowells, nfsv4, fsdevel
>>>>> "Charles" == Charles P Wright <cwright@cs.sunysb.edu> writes:
Charles> On Fri, 2005-11-11 at 08:45 -0500, John T. Kohl wrote:
>> Other than i_mapping/f_mapping, I don't think it's possible right now
>> for stacking file systems to handle the address_space operations in our
>> layer *and* share the same pages with the backing-store, since the struct
>> pages are attached to the address space via file->f_mapping.
Charles> At Stony Brook, we've come across similar problems. It is relatively
Charles> easy to double cache, but inefficient. It is also relatively easy to
Charles> single-cache, but then you don't get to intercept any of these
Charles> interesting operations. Getting both at once is tricky.
We currently do single-caching, by passing on the mmap operation to the
backing store (swapping in the backing store file for vma->vm_file).
(We do the equivalent in our MVFS built for vnode kernels.) Swapping
the vm file is mostly workable, but we do have to be a bit too
knowledgeable about the innards of file mapping and do some things to
accommodate the actions taken after fop->mmap is called.
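The swap itself is conceptually simple (a sketch only; get_backing_file()
stands in for however the backing-store file is located, and the real code
has more to it):

static int mvfs_like_mmap(struct file *upper_file, struct vm_area_struct *vma)
{
        struct file *backing_file = get_backing_file(upper_file);

        if (!backing_file->f_op || !backing_file->f_op->mmap)
                return -ENODEV;

        get_file(backing_file);   /* the vma will hold this reference */
        fput(vma->vm_file);       /* drop the upper file's reference */
        vma->vm_file = backing_file;

        return backing_file->f_op->mmap(backing_file, vma);
}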
However, it does mean that things like /proc/<pid>/exe show the
backing-store file name not the upper-level name. That screws up some
programs like Java which use /proc/self/exe to find their environment,
since our backing-store directory layout is nothing like the upper-level
layout.
--
John Kohl
Senior Software Engineer - Rational Software - IBM Software Group
Lexington, Massachusetts, USA
jtk@us.ibm.com
<http://www.ibm.com/software/rational/>
* Re: [RFC] Support for stackable file systems on top of nfs
2005-11-11 2:06 ` Trond Myklebust
@ 2005-11-11 18:18 ` Bryan Henderson
2005-11-11 19:22 ` Trond Myklebust
0 siblings, 1 reply; 36+ messages in thread
From: Bryan Henderson @ 2005-11-11 18:18 UTC (permalink / raw)
To: Trond Myklebust; +Cc: fsdevel, nfsv4, Dave Kleikamp, Shaya Potter
>> >It should hardly come as a newsflash that remote filesystems are
>> >inherently different to local filesystems.
>>
>> You'd have to give a pretty specific definition of remote filesystem
>> before I'd agree with that. At its most basic level, remote just means
>> distant, and the matter of needing a credential to access a file has more
>> to do with the fact that the filesystem is shared than that it is distant.
>
>Show me a remote filesystem that doesn't have some form of
>authentication.
An ordinary NFS filesystem is remote and does not have authentication. I'm
sure you mean identification (Identification is saying who is writing the
data; authentication is proving it is he). In NFS, authentication is done
by the client operating system and in a Linux client it's totally outside
of the filesystem function. Identification is something local filesystems
do as well. What sets NFS apart here is that the identification happens
at physical write (cache clean) time instead of just at open time. That's
not an inherent part of being remote (distant). In fact, I don't even
know a word, other than NFS, for the class of filesystems that have this
characteristic.
>It is an inherent feature of shared mmapped files that the pages can
>be written to by different users. When the VM finally gets round to
>flushing them out, all it knows is that this page is dirty.
>> I'm a little fuzzy on how that works anyway, ...
You acknowledge this burning question without answering it, and I'd really
like to understand. How do you determine at pageout time what credential
to give the NFS server? I think you said it has to do with credentials
cached in the struct file, but the same way you can't attach a credential
to the dirty page, you can't attach a struct file to it, right? And is it
just a shared mmap problem, or is it the same thing if multiple users
simultaneously write() to the file cache?
--
Bryan Henderson IBM Almaden Research Center
San Jose CA Filesystems
* Re: [RFC] Support for stackable file systems on top of nfs
2005-11-11 16:40 ` Nikita Danilov
@ 2005-11-11 18:45 ` Bryan Henderson
2005-11-11 19:31 ` Nikita Danilov
0 siblings, 1 reply; 36+ messages in thread
From: Bryan Henderson @ 2005-11-11 18:45 UTC (permalink / raw)
To: Nikita Danilov; +Cc: fsdevel, nfsv4, Dave Kleikamp, Shaya Potter
> > I've always been irritated by the fact that filesystem drivers see struct
> > file at all. struct file ought to live in a higher
> > filesystem-type-independent layer, with the filesystem driver seeing just
> > inodes.
>
>This is how *BSD VFS works, and it is a pain in the neck. For example,
>implementing file-system-type specific read-ahead algorithm requires
>jumping through all kinds of loops,
Layering is always a pain in the neck. Disk device designers are pained
by the fact that they can't see the struct file, not to mention the block
maps, and therefore can't do optimal head scheduling. Some Linux
application designers are pained by the fact that they can't tell where on
the disk the bytes of a file live, so they can't plan their accesses
better. But there are of course plenty of advantages to modular design.
In the model where struct file lives in one layer and struct inode lives
in another and they don't mix, the readahead you're talking about belongs
in the higher layer. A distinct advantage of that is that it works for
all filesystem types without duplication of code. The disadvantage is
that to do optimal readahead you also have to know stuff only the
filesystem driver knows. The compromise is an additional interface that
allows a narrow set of readahead related information to be exchanged
across the layers (which is, of course, a pain in the neck).
--
Bryan Henderson IBM Almaden Research Center
San Jose CA Filesystems
* Re: [RFC] Support for stackable file systems on top of nfs
2005-11-11 18:18 ` Bryan Henderson
@ 2005-11-11 19:22 ` Trond Myklebust
2005-11-11 21:57 ` Bryan Henderson
0 siblings, 1 reply; 36+ messages in thread
From: Trond Myklebust @ 2005-11-11 19:22 UTC (permalink / raw)
To: Bryan Henderson; +Cc: fsdevel, Shaya Potter, nfsv4, Dave Kleikamp
On Fri, 2005-11-11 at 10:18 -0800, Bryan Henderson wrote:
> >> >It should hardly come as a newsflash that remote filesystems are
> >> >inherently different to local filesystems.
> >>
> >> You'd have to give a pretty specific definition of remote filesystem
> >> before I'd agree with that. At its most basic level, remote just means
> >> distant, and the matter of needing a credential to access a file has more
> >> to do with the fact that the filesystem is shared than that it is distant.
> >
> >Show me a remote filesystem that doesn't have some form of
> >authentication.
>
> An ordinary NFS filesystem is remote and does not have authentication. I'm
> sure you mean identification (Identification is saying who is writing the
> data; authentication is proving it is he). In NFS, authentication is done
> by the client operating system and in a Linux client it's totally outside
> of the filesystem function.
I mean authentication. The act of proving that a given remote procedure
call is being sent on behalf of a given authorised individual.
That is precisely what an RPCSEC_GSS session allows by virtue of a
secure per-user channel which has been set up using some standard strong
authentication method (krb5 being currently the most commonly used such
method).
Even the old and untrusty AUTH_SYS (still the NFS default) does some
limited form of authentication: the server identifies the client, checks
if the client is on a trusted list, then reads off the RPC's user+group
information. It's not particularly secure, but it is authentication.
> Identification is something local filesystems
> do as well. What sets NFS apart here is that the identification happens
> at physical write (cache clean) time instead of just at open time. That's
> not an inherent part of being remote (distant). In fact, I don't even
> know a word, other than NFS, for the class of filesystems that have this
> characteristic.
As long as you're prepared to group filesystems such as AFS/DFS, CIFS,
etc under the NFS umbrella.
> >It is an inherent feature of shared mmapped files that the pages can
> >be written to by different users. When the VM finally gets round to
> >flushing them out, all it knows is that this page is dirty.
> >> I'm a little fuzzy on how that works anyway, ...
>
> You acknowledge this burning question without answering it, and I'd really
> like to understand. How do you determine at pageout time what credential
> to give the NFS server? I think you said it has to do with credentials
> cached in the struct file, but the same way you can't attach a credential
> to the dirty page, you can't attach a struct file to it, right? And is it
> just a shared mmap problem, or is it the same thing if multiple users
> simultaneously write() to the file cache?
Ordinary writes go through the prepare_write()/commit_write() interface,
and so we tag them with the appropriate credentials + state there. We
don't bother to tag the pages with the "PG_dirty" bit 'cos we don't want
the VM to cycle them through the writepage() interface. Instead we track
the page state ourselves.
The only place weirdness can come from is mmap(), since there we are at
the mercy of the limitations of the VM's dirty page tracking.
Cheers,
Trond
* Re: [RFC] Support for stackable file systems on top of nfs
2005-11-11 18:45 ` Bryan Henderson
@ 2005-11-11 19:31 ` Nikita Danilov
2005-11-11 19:42 ` Trond Myklebust
2005-11-11 23:13 ` Bryan Henderson
0 siblings, 2 replies; 36+ messages in thread
From: Nikita Danilov @ 2005-11-11 19:31 UTC (permalink / raw)
To: Bryan Henderson; +Cc: fsdevel, nfsv4, Dave Kleikamp, Shaya Potter
Bryan Henderson writes:
[...]
> >
> >This is how *BSD VFS works, and it is a pain in the neck. For example,
> >implementing file-system-type specific read-ahead algorithm requires
> >jumping through all kinds of loops,
>
> Layering is always a pain in the neck. Disk device designers are pained
But well-thought layering is much less so.
I don't understand why struct file belongs to the VFS and struct inode
to the file system driver. It is not that struct file is a more abstract
and more generic representation of the same entity as the inode. struct
file represents a "usage handle" for the file system object (represented
by struct inode), and the file system driver has a direct interest in
working with such handles.
What one may argue, on the other hand, is that struct file should be
split into a generic and a file-system-specific part, with the low-level
code only using the latter.
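Something like the following, hypothetically (this is not existing kernel
code, just an illustration of the proposed split):

struct file_generic {                   /* owned by the VFS layer */
        struct dentry           *f_dentry;
        unsigned int            f_mode;
        loff_t                  f_pos;
        struct file_fs_part     *f_fs;  /* handed to the fs driver */
};

struct file_fs_part {                   /* owned by the file system driver */
        void                    *private;  /* e.g. an NFS open context */
};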
[...]
Nikita.
* Re: [RFC] Support for stackable file systems on top of nfs
2005-11-11 19:31 ` Nikita Danilov
@ 2005-11-11 19:42 ` Trond Myklebust
2005-11-11 23:13 ` Bryan Henderson
1 sibling, 0 replies; 36+ messages in thread
From: Trond Myklebust @ 2005-11-11 19:42 UTC (permalink / raw)
To: Nikita Danilov
Cc: fsdevel, Shaya Potter, nfsv4, Bryan Henderson, Dave Kleikamp
On Fri, 2005-11-11 at 22:31 +0300, Nikita Danilov wrote:
> What one may argue, on the other hand, is that struct file should be
> split into generic and file system specific part, with the low level
> code only using the latter.
This would be acceptable (and indeed we already do something like this
in the low-level NFS read/write code), but would it really make much of
a difference to stackable filesystems? Tracking the full struct file
should be no more difficult than tracking the file->private_data.
Cheers,
Trond
* Re: [RFC] Support for stackable file systems on top of nfs
2005-11-11 19:22 ` Trond Myklebust
@ 2005-11-11 21:57 ` Bryan Henderson
2005-11-11 22:41 ` Trond Myklebust
0 siblings, 1 reply; 36+ messages in thread
From: Bryan Henderson @ 2005-11-11 21:57 UTC (permalink / raw)
To: Trond Myklebust; +Cc: fsdevel, nfsv4, Dave Kleikamp, Shaya Potter
>I mean authentication. The act of proving that a given remote procedure
>call is being sent on behalf of a given authorised individual.
Well then plain NFS (the classic kind without RPCSEC_GSS; the only kind I
have ever been close to) is a great example of a remote filesystem type
(by just about any definition) that doesn't have authentication.
>Even the old and untrusty AUTH_SYS (still the NFS default) does some
>limited form of authentication: the server identifies the client, checks
>if the client is on a trusted list, then reads off the RPC's user+group
>information.
I don't see anything here where someone proves that the user is who the
RPC says. The server just assumes he is because he trusts the client. And
no matter what your reason for trusting the client, just trusting the
claimant isn't authentication. Incidentally, the server -- in the most
basic configuration -- doesn't authenticate the _client_ either. He
assumes that if the IP packets say they're from IP address 1.2.3.4, then
they are.
>What sets NFS apart here is that the identification happens
>> at physical write (cache clean) time instead of just at open time. That's
>> not an inherent part of being remote (distant). In fact, I don't even
>> know a word, other than NFS, for the class of filesystems that have this
>> characteristic.
>
>As long as you're prepared to group filesystems such as AFS/DFS, CIFS,
>etc under the NFS umbrella.
I don't know these very well, but CIFS maintains a connection between
client and server that spans the placing of the write data in the cache
and the cleaning of the cache, and both identifies and authenticates the
user at connection time. So I assume it does not have to identify the
user when it writes an individual page from cache to the server.
>Ordinary writes go through the prepare_write()/commit_write() interface,
>and so we tag them with the appropriate credentials + state there.
This is the part I don't follow. Where does that tag go? How does the
code that eventually sends that data to the NFS server get the
credentials?
>The only place weirdness can come from is mmap(), since there we are at
>the mercy of the limitations of the VM's dirty page tracking.
I hate to push, but: So what does the NFS driver do for that?
--
Bryan Henderson IBM Almaden Research Center
San Jose CA Filesystems
* Re: [RFC] Support for stackable file systems on top of nfs
2005-11-11 21:57 ` Bryan Henderson
@ 2005-11-11 22:41 ` Trond Myklebust
2005-11-14 19:02 ` Bryan Henderson
0 siblings, 1 reply; 36+ messages in thread
From: Trond Myklebust @ 2005-11-11 22:41 UTC (permalink / raw)
To: Bryan Henderson; +Cc: fsdevel, Shaya Potter, nfsv4, Dave Kleikamp
On Fri, 2005-11-11 at 13:57 -0800, Bryan Henderson wrote:
> >I mean authentication. The act of proving that a given remote procedure
> >call is being sent on behalf of a given authorised individual.
>
> Well then plain NFS (the classic kind without RPCSEC_GSS; the only kind I
> have ever been close to) is a great example of a remote filesystem type
> (by just about any definition) that doesn't have authentication.
>
> >Even the old and untrusty AUTH_SYS (still the NFS default) does some
> >limited form of authentication: the server identifies the client, checks
> >if the client is on a trusted list, then reads off the RPC's user+group
> >information.
>
> I don't see anything here where someone proves that the user is who the
> RPC says. The server just assumes he is because he trusts the client. And
> no matter what your reason for trusting the client, just trusting the
> claimant isn't authentication. Incidentally, the server -- in the most
> basic configuration -- doesn't authenticate the _client_ either. He
> assumes that if the IP packets say they're from IP address 1.2.3.4, then
> they are.
There is always a chain of trust in any authentication scheme.
When using Kerberos for login authentication, you are basically trusting
an IP address too: as far as I know, the KDC doesn't do anything to
authenticate itself to you.
In the case of AUTH_SYS, the client is playing the role of the "KDC" and
is being trusted to authenticate users to the server (but not itself).
> >What sets NFS apart here is that the identification happens
> >> at physical write (cache clean) time instead of just at open time. That's
> >> not an inherent part of being remote (distant). In fact, I don't even
> >> know a word, other than NFS, for the class of filesystems that have this
> >> characteristic.
> >
> >As long as you're prepared to group filesystems such as AFS/DFS, CIFS,
> >etc under the NFS umbrella.
>
> I don't know these very well, but CIFS maintains a connection between
> client and server that spans the placing of the write data in the cache
> and the cleaning of the cache, and both identifies and authenticates the
> user at connection time. So I assume it does not have to identify the
> user when it writes an individual page from cache to the server.
This is the exact same thing that an RPCSEC_GSS session does, except
RPCSEC_GSS allows you to multiplex the various "connections" over a
single TCP socket.
> >Ordinary writes go through the prepare_write()/commit_write() interface,
> >and so we tag them with the appropriate credentials + state there.
>
> This is the part I don't follow. Where does that tag go? How does the
> code that eventually sends that data to the NFS server get the
> credentials?
Each dirty page is tagged by a single "struct nfs_page" (see
include/linux/nfs_page.h) that tracks its NFS state. Those nfs_pages are
placed on the appropriate list (dirty, commit) in the struct nfs_inode
to allow us to track dirty state on a per-inode basis.
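A minimal sketch of the idea -- the structure and helper names below are
simplified stand-ins, not the actual definitions in include/linux/nfs_page.h:

	#include <linux/list.h>
	#include <linux/slab.h>
	#include <linux/nfs_fs.h>

	/* Simplified tag attached to each dirty page: it pins the open
	 * context (and hence the RPC credential) that was current when the
	 * page was dirtied, so the eventual flush knows whose credentials
	 * to put on the wire. */
	struct example_nfs_page_tag {
		struct list_head	 list;		/* inode's dirty or commit list */
		struct page		*page;		/* the dirty page being tracked */
		struct nfs_open_context	*ctx;		/* credentials + open state */
		unsigned int		 offset;	/* dirty range within the page */
		unsigned int		 bytes;
	};

	static struct example_nfs_page_tag *
	example_tag_dirty_page(struct nfs_open_context *ctx, struct page *page,
			       unsigned int offset, unsigned int bytes)
	{
		struct example_nfs_page_tag *tag = kmalloc(sizeof(*tag), GFP_NOFS);

		if (tag == NULL)
			return NULL;
		INIT_LIST_HEAD(&tag->list);
		tag->page = page;
		tag->ctx = get_nfs_open_context(ctx);	/* hold the credential until flush */
		tag->offset = offset;
		tag->bytes = bytes;
		return tag;
	}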
> >The only place weirdness can come from is mmap(), since there we are at
> >the mercy of the limitations of the VM's dirty page tracking.
>
> I hate to push, but: So what does the NFS driver do for that?
As explained in the very beginning of this thread, we try to make sure
that the dirty pages get flushed out using one of the open files from
the vm_area_struct.
There is a greater margin for data loss when you do this (someone on the
server may have revoked the access permissions for the user that owns
the struct file we chose, but not the one that actually wrote the page)
but that can't be helped.
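A rough sketch of that fallback (the wrapper name is invented;
nfs_find_open_context() is the same lookup the read/write paths use when no
struct file is available, as in the patch at the top of this thread):

	#include <linux/fs.h>
	#include <linux/nfs_fs.h>

	/* A page dirtied through a shared mmap() carries no struct file, so
	 * at flush time fall back to *some* open context on the inode with
	 * write permission -- not necessarily the one belonging to the
	 * process that actually dirtied the page. */
	static struct nfs_open_context *
	example_ctx_for_mmap_flush(struct inode *inode)
	{
		return nfs_find_open_context(inode, NULL, FMODE_WRITE);
	}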
Cheers,
Trond
* Re: [RFC] Support for stackable file systems on top of nfs
2005-11-11 19:31 ` Nikita Danilov
2005-11-11 19:42 ` Trond Myklebust
@ 2005-11-11 23:13 ` Bryan Henderson
1 sibling, 0 replies; 36+ messages in thread
From: Bryan Henderson @ 2005-11-11 23:13 UTC (permalink / raw)
To: Nikita Danilov; +Cc: fsdevel, Shaya Potter, nfsv4, Dave Kleikamp
>It is not that struct file is more abstract
>and more generic representation for the same entity as inode. struct
>file represents "usage handle" for the file system object (represented
>by struct inode), and file system driver has direct interest in working
>with such handles.
If it were _just_ a usage handle, it would be nothing more than a pointer
to the inode. But it is an abstraction: the purpose of the struct file
historically is to represent a stream, which is built on top of a file
image. The file image is a random access thing -- you say "give me Byte
6000." The stream is serial -- you say "give me the next byte". The
stream also has security, whereas the underlying file image doesn't, and a
few other features the raw file image does not have, such as the ability
to sync automatically on writes.
Ordinarily, any time you split the view of something into multiple views,
you're talking about adding a layer. When we split a block device into
files, we add a filesystem layer. When we split a file into rows we add a
database layer. When we split an IP link into multiple conversations, we
add the TCP, UDP, etc. layer. Splitting a file image into multiple
streams looks like the same thing to me.
The stream _is_ for the most part generic. While a VFS read does vastly
different things on a ramfs filesystem vs ext2, the stream read code is
identical for both. A ramfs or ext2 engineer should be able to (and
mostly can) ignore the existence of struct file.
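A simplified sketch of that generic stream layer (this is not the actual
fs/read_write.c code; locking and the various read-method fallbacks are
elided): the struct file carries the stream state, and the filesystem only
supplies the underlying read method.

	#include <linux/fs.h>
	#include <linux/errno.h>

	static ssize_t example_vfs_read(struct file *file, char __user *buf,
					size_t count)
	{
		loff_t pos = file->f_pos;	/* stream state lives in struct file */
		ssize_t ret;

		if (!(file->f_mode & FMODE_READ))
			return -EBADF;		/* stream-level permission check */

		/* dispatch to the filesystem-specific (or generic) read */
		ret = file->f_op->read(file, buf, count, &pos);
		if (ret > 0)
			file->f_pos = pos;	/* advance the stream */
		return ret;
	}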
>What one may argue, on the other hand, is that struct file should be
>split into generic and file system specific part, with the low level
>code only using the latter.
That means adding a concept of sessions down there that doesn't exist
today (I've seen people try to make it exist, only to run against a wall
when they find the NFS server does a VFS read without ever having done a
VFS open). I wouldn't mind that, but if it were the case, I would surely
divide any filesystem driver I write into two layers -- the upper one with
sessions and the lower stateless one! And the NFS server would still need
a sessionless interface.
I haven't really followed how access to struct file enables NFS to do its
thing, but I do know that in systems I've seen where the layers are
strictly separated, the VFS operations have a credential as an argument.
That probably helps a lot with stacking, since the credential would be on
the, well, stack.
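As a purely hypothetical illustration of that style (modelled loosely on
vnode interfaces, not the actual Linux VFS), every operation would carry the
credential explicitly, so a stacked driver could simply pass it down to the
layer below:

	#include <linux/fs.h>
	#include <linux/types.h>

	struct example_cred {
		uid_t	uid;
		gid_t	gid;
		/* supplementary groups, security flavour, etc. elided */
	};

	struct example_file_ops {
		ssize_t (*read)(struct inode *inode, char __user *buf,
				size_t len, loff_t *pos,
				const struct example_cred *cred);
		ssize_t (*write)(struct inode *inode, const char __user *buf,
				 size_t len, loff_t *pos,
				 const struct example_cred *cred);
	};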
--
Bryan Henderson IBM Almaden Research Center
San Jose CA Filesystems
* Re: [RFC] Support for stackable file systems on top of nfs
@ 2005-11-14 0:44 Nikolai Joukov
2005-11-14 16:02 ` David Howells
2005-11-14 16:11 ` John T. Kohl
0 siblings, 2 replies; 36+ messages in thread
From: Nikolai Joukov @ 2005-11-14 0:44 UTC (permalink / raw)
To: John T. Kohl
Cc: Trond Myklebust, dhowells, nfsv4, Charles Wright, linux-fsdevel
>> Charles> On Fri, 2005-11-11 at 08:45 -0500, John T. Kohl wrote:
>>> Other than i_mapping/f_mapping, I don't think it's possible right now
>>> for stacking file systems to handle the address_space operations in our
>>> layer *and* share the same pages with the backing-store, since the struct
>>> pages are attached to the address space via file->f_mapping.
> Charles> At Stony Brook, we've come across similar problems. It is relatively
> Charles> easy to double cache, but inefficient. It is also relatively easy to
> Charles> single-cache, but then you don't get to intercept any of these
> Charles> interesting operations. Getting both at once is tricky.
>
> We currently do single-caching, by passing on the mmap operation to the
> backing store (swapping in the backing store file for vma->vm_file).
> (We do the equivalent in our MVFS built for vnode kernels.) Swapping
> the vm file is mostly workable, but we do have to be a bit too
> knowledgeable about the innards of file mapping and do some things to
> accommodate the actions taken after fop->mmap is called.
What we are discussing here is only the tip of the iceberg. We are
discussing the simplest case:
1) Page N of the upper filesystem's file corresponds to page N of the
lower file.
2) No page processing is necessary.
In that case we use either the technique described above
(http://lxr.fsl.cs.sunysb.edu/fistgen/source/templates/Linux-2.6/file.c#L458)
or CODA's i_mapping/f_mapping way. Both have their own problems.
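For reference, a minimal sketch of that i_mapping/f_mapping redirection (the
lower-file lookup helper is hypothetical): the upper file simply reuses the
lower file's page cache, so nothing is cached twice, but the upper filesystem
never sees the address_space operations either.

	#include <linux/fs.h>

	static int example_stackable_open(struct inode *inode, struct file *file)
	{
		/* example_get_lower_file() stands in for whatever the stackable
		 * filesystem uses to find the already-opened lower file. */
		struct file *lower_file = example_get_lower_file(file);

		/* Point the upper file and inode at the lower mapping so reads,
		 * writes and mmap() all hit the lower page cache directly. */
		file->f_mapping = lower_file->f_mapping;
		inode->i_mapping = lower_file->f_mapping;
		return 0;
	}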
However, for most stackable filesystems we need to intercept the
writepage/readpage/prepare_write/commit_write operations. Also, page N of
the upper filesystem may correspond to some other page M of the lower
filesystem. For example, this is the case for fan-out stackable
filesystems (Unionfs, and RAID-like filesystems). There are dozens of
practical stackable filesystems where we have to double-cache only because
the Linux VFS does not allow us to intercept the page-based operations *and*
avoid double caching. I would like to point out here that neither *BSD
nor Windows stackable filesystems have this problem. To solve the
problem, the VFS should allow stackable filesystems to 1) do something
(calculate checksums, calculate parity for filesystem-level RAIDs, etc.)
inside of a stackable filesystem's readpage/writepage/..., and 2) call the lower
filesystem's readpage/writepage/... passing *any* page to these lower
functions. The page passed below may be a page of the lower filesystem
to get double caching, or an upper page to get no double caching. Here
is a kludge that works in many cases:
int stackable_readpage(file_t *file, page_t *page)
{
	...
	page->mapping = lower_inode->i_mapping;
	err = lower_inode->i_mapping->a_ops->readpage(lower_file, page);
	page->mapping = inode->i_mapping;
	...
}
All structures with the 'lower_' prefix belong to the lower filesystem.
It doesn't seem to be exactly the right way to go, but it provides the same
flexibility for Linux stackable filesystems that they enjoy in *BSD and
Windows. A correct implementation requires some isolation of the page
structure from the file/dentry/inode added at the VFS level. However, it
would be sufficient if we could make the code above work in all cases
on top of all existing filesystems.
Sincerely,
Nikolai Joukov.
**************************************
* Ph.D. student (Advisor: Erez Zadok)
* File systems and Storage Laboratory
* Stony Brook University (SUNY)
**************************************
* Re: [RFC] Support for stackable file systems on top of nfs
2005-11-10 21:35 ` John T. Kohl
` (2 preceding siblings ...)
2005-11-11 2:31 ` Trond Myklebust
@ 2005-11-14 15:56 ` David Howells
3 siblings, 0 replies; 36+ messages in thread
From: David Howells @ 2005-11-14 15:56 UTC (permalink / raw)
To: Trond Myklebust; +Cc: John T. Kohl, dhowells, nfsv4, fsdevel
Trond Myklebust <trond.myklebust@fys.uio.no> wrote:
> > CODA certainly won't work today with NFS host inodes and mapped files.
> > I'm not surprised nobody noticed, since that seems like a poor way to
> > use CODA. Using NFS backing store is a primary use case for ClearCase
> > MVFS, so we noticed.
>
> It sounds to me like you want to talk to the cachefs folks. They too
> need special hooks in the NFS low-level page cache routines in order to
> be able to mirror write requests to the local backing store and/or
> reroute read requests to that backing store.
>
> David?
There are a number of reasons I don't want to use i_mapping redirection to
support caching, as nice as it may seem to do that:
(1) Most filesystems don't do hole reportage. Holes in files are treated as
blocks of zeros and can't be distinguished otherwise.
(2) The backing inode must be fully populated before being exposed
to userspace through the main inode because the VM/VFS goes directly to
the backing inode and does not interrogate the front inode on VM ops.
Therefore:
(a) The backing inode must fit entirely within the cache.
(b) All backed files currently open must fit entirely within the cache at
the same time.
(c) A working set of files in total larger than the cache may not be
cached.
(d) A file may not grow larger than the available space in the cache.
(e) A file that's open and cached, and remotely grows larger than the
cache is potentially stuffed.
(3) Writes go to the backing filesystem, and can only be transferred to the
network when the file is closed.
(4) There's no record of what changes have been made, so the whole file must
be written back.
(5) The pages belong to the backing filesystem, and all metadata associated
with that page are relevant only to the backing filesystem, and not
anything stacked atop it.
Reading through i_mapping is fun, especially when a normal filesystem is used:
(1) You cannot, for the most part, detect holes, and so you can't use holes
to denote as-yet unfetched blocks.
(2) You don't want a page attached to the netfs that has a duplicate attached
to the backing fs.
(3) It isn't possible to share a page between two filesystems. Both of them
tend to attempt to assert control over the metadata of the page.
What I do with FS-Cache/CacheFS is to say that the netfs owns the page, and
that the cache will read or write the netfs's page directly. The cache will
assume that a block it has not yet been given (a hole) is data not yet
retrieved from the network.
Writing through i_mapping is also fun, particularly if you have shared
writable mappings available.
(1) With shared-mmap you don't know what's changed.
(2) With write you can at least determine what's changed, though it may be
tricky to keep track of what has been written to the cache yet.
(3) You can't use prepare_write and commit_write... they belong to the
underlying FS.
(4) You may have to write the entire file back if it's been changed.
With FS-Cache/CacheFS the pages belong to the netfs. We use a second page bit
(PG_fs_misc) to keep track of data being written to the cache in addition to
PG_writeback - which tracks data being written to the network.
The big problem is that a page cannot belong to several filesystems at once,
and cannot hold metadata for those filesystems all at the same time.
David
* Re: [RFC] Support for stackable file systems on top of nfs
2005-11-14 0:44 [RFC] Support for stackable file systems on top of nfs Nikolai Joukov
@ 2005-11-14 16:02 ` David Howells
2005-11-14 20:48 ` Erez Zadok
2005-11-14 16:11 ` John T. Kohl
1 sibling, 1 reply; 36+ messages in thread
From: David Howells @ 2005-11-14 16:02 UTC (permalink / raw)
To: Nikolai Joukov
Cc: John T. Kohl, Trond Myklebust, dhowells, nfsv4, Charles Wright,
linux-fsdevel
Nikolai Joukov <kolya@cs.sunysb.edu> wrote:
> int stackable_readpage(file_t *file, page_t *page)
> {
> ...
> page->mapping = lower_inode->i_mapping;
> err = lower_inode->i_mapping->a_ops->readpage(lower_file, page);
> page->mapping = inode->i_mapping;
> ...
> }
This is a really bad idea for a number of reasons:
(1) page->mapping isn't the only metadata in the page.
(2) The lower_inode may interpret a hole as a block of zeros rather than
bouncing the request back to the higher inode with ENODATA or something.
(3) The lower_inode readpage() may not be complete at the time you switch the
mapping pointer back. Obviously the page will be locked until such
time as completion or an error occurs.
(4) The lower_inode readpage() may complete before it returns, in which case
the VM may go and do something unspeakable to that page whilst it's still
got the wrong mapping attached.
(5) The lower_inode may have attached its own metadata to page->private, and
this may refer back to this page, and may subsequently trip an assertion
because the page's mapping pointer has been corrupted.
David
* Re: [RFC] Support for stackable file systems on top of nfs
2005-11-14 0:44 [RFC] Support for stackable file systems on top of nfs Nikolai Joukov
2005-11-14 16:02 ` David Howells
@ 2005-11-14 16:11 ` John T. Kohl
1 sibling, 0 replies; 36+ messages in thread
From: John T. Kohl @ 2005-11-14 16:11 UTC (permalink / raw)
To: kolya; +Cc: Trond Myklebust, dhowells, nfsv4, Charles Wright, linux-fsdevel
>>>>> "Nikolai" == Nikolai Joukov <kolya@cs.sunysb.edu> writes:
Nikolai> Here is a kludge that works in many cases:
Nikolai> int stackable_readpage(file_t *file, page_t *page)
Nikolai> {
Nikolai> ...
Nikolai> page->mapping = lower_inode->i_mapping;
Nikolai> err = lower_inode->i_mapping->a_ops->readpage(lower_file, page);
Nikolai> page->mapping = inode->i_mapping;
Nikolai> ...
Nikolai> }
What are the locking requirements to do this safely? Are the proper
locks held (or safely grabbable during readpage()) to avoid a race with
another paging operation?
I see a troubling comment in do_generic_mapping_read():
/* ... and start the actual read. The read will unlock the page. */
error = mapping->a_ops->readpage(filp, page);
but I've not studied the page locking design to know for sure that this
example is race-free.
--
John Kohl
Senior Software Engineer
Rational Software
IBM Software Group
Lexington, Massachusetts, USA
jtk@us.ibm.com
The opinions expressed in this message do not reflect the views of my
employer.
* Re: [RFC] Support for stackable file systems on top of nfs
2005-11-11 22:41 ` Trond Myklebust
@ 2005-11-14 19:02 ` Bryan Henderson
0 siblings, 0 replies; 36+ messages in thread
From: Bryan Henderson @ 2005-11-14 19:02 UTC (permalink / raw)
To: Trond Myklebust; +Cc: fsdevel, Shaya Potter, nfsv4, Dave Kleikamp
>> I don't see anything here where someone proves that the user is who the
>> RPC says. The server just assumes he is because he trusts the client. And
>> no matter what your reason for trusting the client, just trusting the
>> claimant isn't authentication. Incidentally, the server -- in the most
>> basic configuration -- doesn't authenticate the _client_ either. He
>> assumes that if the IP packets say they're from IP address 1.2.3.4, then
>> they are.
>
>There is always a chain of trust in any authentication scheme.
Agreed. But the converse is not true. The fact that there's a chain of
trust in it doesn't mean it's an authentication scheme. If you trust the
person handing you the credentials (even if they're for someone else),
you're not authenticating anything. If you take them to a third party you
trust and say, "are these real?", that's authentication.
I can see calling it an act of authentication where the NFS server has
multiple forms of authentication and explicitly checks and determines that
AUTH_SYS is in use and therefore no further authentication is required.
But that's an implementation thing. In the protocol, and from the NFS
client's perspective, there is no authentication.
Remember that I'm specifically talking about the case of classic NFS here.
I'm using it as an example of a type of remote filesystem that does not
do authentication. You claimed that remote filesystems are fundamentally
different and so must follow different rules in Linux, and gave as a
distinguishing characteristic of "remote filesystems" that they do
authentication.
This terminology dispute is actually moot, though, because there are
filesystems I would call remote that don't do anything similar to what
classic NFS does in the area of user identities, no matter what you call
it. I think if you want to identify a property of NFS that entitles it to
different consideration from something like ext2, I'd say the fact that
it's a shared filesystem would be a good one. But then, maybe that's part
of your definition of remote.
>> >As long as you're prepared to group filesystems such as AFS/DFS, CIFS,
>> >etc under the NFS umbrella.
>>
>> I don't know these very well, but CIFS maintains a connection between
>> client and server that spans the placing of the write data in the cache
>> and the cleaning of the cache, and both identifies and authenticates the
>> user at connection time. So I assume it does not have to identify the
>> user when it writes an individual page from cache to the server.
>
>This is the exact same thing that an RPCSEC_GSS session does, except
>RPCSEC_GSS allows you to multiplex the various "connections" over a
>single TCP socket.
Then NFS with RPCSEC_GSS is another filesystem type I would not include
under the "NFS umbrella." (Again, I've been talking about classic NFS
only). It is only that requirement to identify the user when you write an
individual page from cache to the server that causes the complexities
we've been talking about. (Reminder of context: I had said that this is
not a characteristic of remote filesystems or of any other class of
filesystems for which I know a name other than "NFS").
--
Bryan Henderson IBM Almaden Research Center
San Jose CA Filesystems
* Re: [RFC] Support for stackable file systems on top of nfs
2005-11-14 16:02 ` David Howells
@ 2005-11-14 20:48 ` Erez Zadok
2005-11-14 21:13 ` John T. Kohl
0 siblings, 1 reply; 36+ messages in thread
From: Erez Zadok @ 2005-11-14 20:48 UTC (permalink / raw)
To: David Howells
Cc: Nikolai Joukov, John T. Kohl, Trond Myklebust, nfsv4,
Charles Wright, linux-fsdevel
In message <17963.1131984133@warthog.cambridge.redhat.com>, David Howells writes:
> Nikolai Joukov <kolya@cs.sunysb.edu> wrote:
>
> > int stackable_readpage(file_t *file, page_t *page)
> > {
> > ...
> > page->mapping = lower_inode->i_mapping;
> > err = lower_inode->i_mapping->a_ops->readpage(lower_file, page);
> > page->mapping = inode->i_mapping;
> > ...
> > }
>
> This is a really bad idea for a number of reasons:
>
> (1) page->mapping isn't the only metadata in the page.
>
> (2) The lower_inode may interpret a hole as a block of zeros rather than
> bouncing the request back to the higher inode with ENODATA or something.
>
> (3) The lower_inode readpage() may not be complete at the time you switch the
> mapping pointer back. Obviously the page will be locked until such
> time as completion or an error occurs.
>
> (4) The lower_inode readpage() may complete before it returns, in which case
> the VM may go and do something unspeakable to that page whilst it's still
> got the wrong mapping attached.
>
> (5) The lower_inode may have attached its own metadata to page->private, and
> this may refer back to this page, and may subsequently trip an assertion
> because the page's mapping pointer has been corrupted.
>
> David
David, we fully agree. That's why we don't like this page-flipping hack,
even if "it works" for us under limited conditions. We'd much rather fix
the problem properly. Maybe now that there appear to be others in need of
something similar, we can discuss on this list possible ways to fix this in
the vfs/mm proper?
One idea we considered is that no one above the file system (vfs or mm)
should dereference the page->mapping directly, but go through a new f/s
method. The default method will do what the code does now (extract the
stuffed inode from inside the mapping). But any file system that wants to
could override this method with its own version that handles stacking
and/or nfs as needed.
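A rough sketch of that idea (all names below are hypothetical): the VM/VFS
would call a per-filesystem method instead of dereferencing
page->mapping->host itself, with the default doing exactly what the code
does today and a stackable filesystem overriding it.

	#include <linux/fs.h>
	#include <linux/mm.h>

	struct example_page_operations {
		struct inode *(*page_inode)(struct page *page);
	};

	/* Default: what the VM/VFS does now. */
	static struct inode *example_default_page_inode(struct page *page)
	{
		return page->mapping->host;
	}

	/* A stackable filesystem could map the lower (backing) inode back to
	 * its own upper inode; example_lookup_upper_inode() stands in for a
	 * per-filesystem lookup that does not exist today. */
	static struct inode *example_stackable_page_inode(struct page *page)
	{
		return example_lookup_upper_inode(page->mapping->host);
	}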
Will this work? Will it give us the functionality we need? Or would we
need a single page to possibly point back to a _chain_ of inodes (which
would start looking like Skinner's "pvnode" '93 idea)?
We haven't explored the full code to see if this idea will work out. So
we'd like to start some discussion as to what would it take to support such
functionality cleanly. We'd be happy to work with anyone on a fix for this
issue, and also come up with patches to test and (eventually) submit for
inclusion in the mainline kernel.
Thanks,
Erez.
* Re: [RFC] Support for stackable file systems on top of nfs
2005-11-14 20:48 ` Erez Zadok
@ 2005-11-14 21:13 ` John T. Kohl
2005-11-14 21:32 ` Jamie Lokier
0 siblings, 1 reply; 36+ messages in thread
From: John T. Kohl @ 2005-11-14 21:13 UTC (permalink / raw)
To: Erez Zadok
Cc: nfsv4, Nikolai Joukov, David Howells, Charles Wright,
linux-fsdevel, Trond Myklebust
>>>>> "Erez" == Erez Zadok <ezk@cs.sunysb.edu> writes:
Erez> Will this work? Will it give us the functionality we need? Or would we
Erez> need a single page to possibly point back to a _chain_ of inodes (which
Erez> would start looking like Skinner's "pvnode" '93 idea)?
Also have a look at Heidemann's PhD thesis,
http://www.isi.edu:80/people/johnh/PAPERS/Heidemann95e.html
--
John Kohl
Senior Software Engineer - Rational Software - IBM Software Group
Lexington, Massachusetts, USA
jtk@us.ibm.com
<http://www.ibm.com/software/rational/>
* Re: [RFC] Support for stackable file systems on top of nfs
2005-11-14 21:13 ` John T. Kohl
@ 2005-11-14 21:32 ` Jamie Lokier
0 siblings, 0 replies; 36+ messages in thread
From: Jamie Lokier @ 2005-11-14 21:32 UTC (permalink / raw)
To: John T. Kohl
Cc: Erez Zadok, David Howells, Nikolai Joukov, Trond Myklebust, nfsv4,
Charles Wright, linux-fsdevel
John T. Kohl wrote:
> >>>>> "Erez" == Erez Zadok <ezk@cs.sunysb.edu> writes:
>
> Erez> Will this work? Will it give us the functionality we need? Or would we
> Erez> need a single page to possibly point back to a _chain_ of inodes (which
> Erez> would start looking like Skinner's "pvnode" '93 idea)?
>
> Also have a look at Heidemann's PhD thesis,
> http://www.isi.edu:80/people/johnh/PAPERS/Heidemann95e.html
This looks like it might also have relevance to "copy-on-write" copied
files - multiple inodes, on the same filesystem, sharing pages in
memory as well as on disk. Good for virtual machines sharing files, etc.
A key feature of COW files is the need to mmap() them, and for
modifications to one to not affect the mapped view of the other.
Pages attached to multiple inodes would be a way of implementing that.
I don't have time to write more, so I'm just putting out the
observation.
-- Jamie
Thread overview: 36+ messages
2005-11-14 0:44 [RFC] Support for stackable file systems on top of nfs Nikolai Joukov
2005-11-14 16:02 ` David Howells
2005-11-14 20:48 ` Erez Zadok
2005-11-14 21:13 ` John T. Kohl
2005-11-14 21:32 ` Jamie Lokier
2005-11-14 16:11 ` John T. Kohl
-- strict thread matches above, loose matches on Subject: below --
2005-11-10 17:32 Dave Kleikamp
2005-11-10 20:07 ` Christoph Hellwig
2005-11-10 21:35 ` John T. Kohl
2005-11-10 21:40 ` Shaya Potter
2005-11-10 21:57 ` John T. Kohl
2005-11-10 21:50 ` Christoph Hellwig
2005-11-11 2:31 ` Trond Myklebust
2005-11-11 4:04 ` Trond Myklebust
2005-11-11 13:45 ` John T. Kohl
2005-11-11 15:27 ` Charles P. Wright
2005-11-11 17:38 ` John T. Kohl
2005-11-14 15:56 ` David Howells
2005-11-10 21:24 ` Trond Myklebust
2005-11-10 21:36 ` Shaya Potter
2005-11-10 22:18 ` Trond Myklebust
2005-11-10 22:27 ` Shaya Potter
2005-11-10 22:40 ` Trond Myklebust
2005-11-11 0:12 ` Bryan Henderson
2005-11-11 1:30 ` Brad Boyer
2005-11-11 2:06 ` Trond Myklebust
2005-11-11 18:18 ` Bryan Henderson
2005-11-11 19:22 ` Trond Myklebust
2005-11-11 21:57 ` Bryan Henderson
2005-11-11 22:41 ` Trond Myklebust
2005-11-14 19:02 ` Bryan Henderson
2005-11-11 16:40 ` Nikita Danilov
2005-11-11 18:45 ` Bryan Henderson
2005-11-11 19:31 ` Nikita Danilov
2005-11-11 19:42 ` Trond Myklebust
2005-11-11 23:13 ` Bryan Henderson