* [RFC PATCH 0/2] Fix up the NFS mmap code
[not found] <1262913974.2659.101.camel@localhost>
@ 2010-01-09 0:56 ` Trond Myklebust
2010-01-09 0:56 ` [RFC PATCH 2/2] NFS: Fix a potential deadlock in nfs_file_mmap() Trond Myklebust
[not found] ` <20100109005624.7473.33215.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
0 siblings, 2 replies; 14+ messages in thread
From: Trond Myklebust @ 2010-01-09 0:56 UTC (permalink / raw)
To: Andi Kleen, Linus Torvalds; +Cc: linux-kernel, linux-nfs
How about something like the following. I chose to wrap the call to
do_mmap_pgoff() instead of making a special ->pre_mmap(), since that
seems more consistent with the way we handle ->read() and ->write().
I also left sys_uselib() and do_execve() to rely on revalidate at
open(), since executables and libraries really are not ever expected to
change while open.
Cheers
Trond
---
Trond Myklebust (2):
NFS: Fix a potential deadlock in nfs_file_mmap()
VFS: Add a mmap_file() callback to struct file_operations
fs/nfs/file.c | 28 ++++++++++++++++++++++------
fs/nfs/inode.c | 4 ++++
include/linux/fs.h | 5 +++++
mm/filemap.c | 23 +++++++++++++++++++++++
mm/mmap.c | 11 ++++++++---
mm/nommu.c | 11 ++++++++---
6 files changed, 70 insertions(+), 12 deletions(-)
--
Signature
^ permalink raw reply [flat|nested] 14+ messages in thread
* [RFC PATCH 1/2] VFS: Add a mmap_file() callback to struct file_operations
[not found] ` <20100109005624.7473.33215.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
@ 2010-01-09 0:56 ` Trond Myklebust
2010-01-09 1:17 ` [RFC PATCH 0/2] Fix up the NFS mmap code Linus Torvalds
2010-01-10 2:00 ` Andi Kleen
2 siblings, 0 replies; 14+ messages in thread
From: Trond Myklebust @ 2010-01-09 0:56 UTC (permalink / raw)
To: Andi Kleen, Linus Torvalds; +Cc: linux-kernel, linux-nfs
Add a helper function to allow the NFS filesystem to hook mmap() system
calls and do the necessary page cache revalidation before passing control
to the VM layer.
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
include/linux/fs.h | 5 +++++
mm/filemap.c | 23 +++++++++++++++++++++++
mm/mmap.c | 11 ++++++++---
mm/nommu.c | 11 ++++++++---
4 files changed, 44 insertions(+), 6 deletions(-)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9147ca8..5d66b16 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1504,6 +1504,9 @@ struct file_operations {
ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
int (*setlease)(struct file *, long, struct file_lock **);
+ unsigned long (*mmap_pgoff)(struct file *, unsigned long,
+ unsigned long, unsigned long,
+ unsigned long, unsigned long);
};
struct inode_operations {
@@ -2191,6 +2194,8 @@ extern int set_blocksize(struct block_device *, int);
extern int sb_set_blocksize(struct super_block *, int);
extern int sb_min_blocksize(struct super_block *, int);
+extern unsigned long generic_file_mmap_pgoff(struct file *, unsigned long,
+ unsigned long, unsigned long, unsigned long, unsigned long);
extern int generic_file_mmap(struct file *, struct vm_area_struct *);
extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size);
diff --git a/mm/filemap.c b/mm/filemap.c
index 96ac6b0..f7717b9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1594,6 +1594,29 @@ const struct vm_operations_struct generic_file_vm_ops = {
.fault = filemap_fault,
};
+/**
+ * generic_file_mmap_pgoff - generic filesystem mmap_pgoff routine
+ * @file: file to mmap
+ * @addr: memory address to map to
+ * @len: length of the mapping
+ * @prot: memory protection flags
+ * @flags: mapping type
+ * @pgoff: starting page offset
+ */
+unsigned long generic_file_mmap_pgoff(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long prot,
+ unsigned long flags, unsigned long pgoff)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long retval;
+
+ down_write(&mm->mmap_sem);
+ retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
+ up_write(&mm->mmap_sem);
+ return retval;
+}
+EXPORT_SYMBOL(generic_file_mmap_pgoff);
+
/* This is used for a general mmap of a disk file */
int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
diff --git a/mm/mmap.c b/mm/mmap.c
index ee22989..3931811 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1047,6 +1047,9 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
unsigned long, prot, unsigned long, flags,
unsigned long, fd, unsigned long, pgoff)
{
+ unsigned long (*func)(struct file *, unsigned long,
+ unsigned long, unsigned long,
+ unsigned long, unsigned long);
struct file *file = NULL;
unsigned long retval = -EBADF;
@@ -1073,9 +1076,11 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
- down_write(¤t->mm->mmap_sem);
- retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
- up_write(¤t->mm->mmap_sem);
+ if (file && file->f_op && file->f_op->mmap_pgoff)
+ func = file->f_op->mmap_pgoff;
+ else
+ func = generic_file_mmap_pgoff;
+ retval = func(file, addr, len, prot, flags, pgoff);
if (file)
fput(file);
diff --git a/mm/nommu.c b/mm/nommu.c
index 1777386..4e31cb2 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1407,6 +1407,9 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
unsigned long, prot, unsigned long, flags,
unsigned long, fd, unsigned long, pgoff)
{
+ unsigned long (*func)(struct file *, unsigned long,
+ unsigned long, unsigned long,
+ unsigned long, unsigned long);
struct file *file = NULL;
unsigned long retval = -EBADF;
@@ -1418,9 +1421,11 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
- down_write(¤t->mm->mmap_sem);
- retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
- up_write(¤t->mm->mmap_sem);
+ if (file && file->f_op && file->f_op->mmap_pgoff)
+ func = file->f_op->mmap_pgoff;
+ else
+ func = generic_file_mmap_pgoff;
+ retval = func(file, addr, len, prot, flags, pgoff);
if (file)
fput(file);
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [RFC PATCH 2/2] NFS: Fix a potential deadlock in nfs_file_mmap()
2010-01-09 0:56 ` [RFC PATCH 0/2] Fix up the NFS mmap code Trond Myklebust
@ 2010-01-09 0:56 ` Trond Myklebust
[not found] ` <20100109005624.7473.15560.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
[not found] ` <20100109005624.7473.33215.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
1 sibling, 1 reply; 14+ messages in thread
From: Trond Myklebust @ 2010-01-09 0:56 UTC (permalink / raw)
To: Andi Kleen, Linus Torvalds; +Cc: linux-kernel, linux-nfs
We cannot call nfs_invalidate_mapping() inside file->f_ops->mmap(), since
this would cause us to grab the inode->i_mutex while already holding the
current->mm->mmap_sem (thus causing a potential ABBA deadlock with the file
write code, which can grab those locks in the opposite order).
We can fix this situation for the mmap() system call by using the new
mmap_pgoff() callback, which is called prior to taking the
current->mm->mmap_sem mutex.
We also add ensure that open() invalidates the mapping if the inode data is
stale so that other users of mmap() (mainly the exec and uselib system
calls) get up to date data too.
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
fs/nfs/file.c | 28 ++++++++++++++++++++++------
fs/nfs/inode.c | 4 ++++
2 files changed, 26 insertions(+), 6 deletions(-)
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 6b89132..723e254 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -42,6 +42,9 @@ static int nfs_file_open(struct inode *, struct file *);
static int nfs_file_release(struct inode *, struct file *);
static loff_t nfs_file_llseek(struct file *file, loff_t offset, int origin);
static int nfs_file_mmap(struct file *, struct vm_area_struct *);
+static unsigned long nfs_file_mmap_pgoff(struct file * file,
+ unsigned long addr, unsigned long len, unsigned long prot,
+ unsigned long flags, unsigned long pgoff);
static ssize_t nfs_file_splice_read(struct file *filp, loff_t *ppos,
struct pipe_inode_info *pipe,
size_t count, unsigned int flags);
@@ -78,6 +81,7 @@ const struct file_operations nfs_file_operations = {
.splice_write = nfs_file_splice_write,
.check_flags = nfs_check_flags,
.setlease = nfs_setlease,
+ .mmap_pgoff = nfs_file_mmap_pgoff,
};
const struct inode_operations nfs_file_inode_operations = {
@@ -290,24 +294,36 @@ nfs_file_splice_read(struct file *filp, loff_t *ppos,
static int
nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
{
- struct dentry *dentry = file->f_path.dentry;
- struct inode *inode = dentry->d_inode;
int status;
dprintk("NFS: mmap(%s/%s)\n",
- dentry->d_parent->d_name.name, dentry->d_name.name);
+ file->f_path.dentry->d_parent->d_name.name,
+ file->f_path.dentry->d_name.name);
/* Note: generic_file_mmap() returns ENOSYS on nommu systems
* so we call that before revalidating the mapping
*/
status = generic_file_mmap(file, vma);
- if (!status) {
+ if (!status)
vma->vm_ops = &nfs_file_vm_ops;
- status = nfs_revalidate_mapping(inode, file->f_mapping);
- }
return status;
}
+static unsigned long
+nfs_file_mmap_pgoff(struct file * file, unsigned long addr,
+ unsigned long len, unsigned long prot,
+ unsigned long flags, unsigned long pgoff)
+{
+ struct inode *inode = file->f_path.dentry->d_inode;
+ int status;
+
+ status = nfs_revalidate_mapping(inode, file->f_mapping);
+ if (status < 0)
+ return status;
+
+ return generic_file_mmap_pgoff(file, addr, len, prot, flags, pgoff);
+}
+
/*
* Flush any dirty pages for this process, and check for write errors.
* The return status from this call provides a reliable indication of
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index faa0918..90e1d67 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -56,6 +56,8 @@
static int enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED;
static void nfs_invalidate_inode(struct inode *);
+static int nfs_invalidate_mapping(struct inode *inode,
+ struct address_space *mapping);
static int nfs_update_inode(struct inode *, struct nfs_fattr *);
static struct kmem_cache * nfs_inode_cachep;
@@ -694,6 +696,8 @@ int nfs_open(struct inode *inode, struct file *filp)
nfs_file_set_open_context(filp, ctx);
put_nfs_open_context(ctx);
nfs_fscache_set_inode_cookie(inode, filp);
+ if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_DATA)
+ nfs_invalidate_mapping(inode, filp->f_mapping);
return 0;
}
^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [RFC PATCH 0/2] Fix up the NFS mmap code
[not found] ` <20100109005624.7473.33215.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
2010-01-09 0:56 ` [RFC PATCH 1/2] VFS: Add a mmap_file() callback to struct file_operations Trond Myklebust
@ 2010-01-09 1:17 ` Linus Torvalds
[not found] ` <alpine.LFD.2.00.1001081709470.7821-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
2010-01-10 2:00 ` Andi Kleen
2 siblings, 1 reply; 14+ messages in thread
From: Linus Torvalds @ 2010-01-09 1:17 UTC (permalink / raw)
To: Trond Myklebust; +Cc: Andi Kleen, linux-kernel, linux-nfs
On Fri, 8 Jan 2010, Trond Myklebust wrote:
>
> How about something like the following. I chose to wrap the call to
> do_mmap_pgoff() instead of making a special ->pre_mmap(), since that
> seems more consistent with the way we handle ->read() and ->write().
I still don't think that you can ever do mmap _and_ readdir on the same
inode, so there's something wrong with the lockdep annotations.
See my earlier mail about putting directory inodes in a different class
from non-directory inodes, and the email after that that points out that
we already do:
- inode_init_always():
lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key);
- unlock_new_inode() (for directories):
lockdep_set_class(&inode->i_mutex, &type->i_mutex_dir_key);
and I suspect that the reason why NFS triggers lockdep problems is that
NFS probably plays some game with inodes (presumably the root inode) that
ends up bypassing the normal new-inode handling.
In short - I really don't think this issue merits VFS-level (or VM,
whatever you want to call it) hooks. There's a bug there, but I don't
think you're actually fixing the right thing.
And _especially_ not the way you did it, where the filesystem can wrap the
whole complex do_mmap_pgoff. The NFS use you have doesn't seem too bad,
but anybody who tries to be clever and avoid "generic_mmap_do_pgoff()"
would immediately be a major disaster.
Linus
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [RFC PATCH 0/2] Fix up the NFS mmap code
[not found] ` <alpine.LFD.2.00.1001081709470.7821-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
@ 2010-01-09 1:38 ` Al Viro
2010-01-09 1:46 ` Al Viro
2010-01-09 1:57 ` Linus Torvalds
0 siblings, 2 replies; 14+ messages in thread
From: Al Viro @ 2010-01-09 1:38 UTC (permalink / raw)
To: Linus Torvalds; +Cc: Trond Myklebust, Andi Kleen, linux-kernel, linux-nfs
On Fri, Jan 08, 2010 at 05:17:14PM -0800, Linus Torvalds wrote:
>
>
> On Fri, 8 Jan 2010, Trond Myklebust wrote:
> >
> > How about something like the following. I chose to wrap the call to
> > do_mmap_pgoff() instead of making a special ->pre_mmap(), since that
> > seems more consistent with the way we handle ->read() and ->write().
>
> I still don't think that you can ever do mmap _and_ readdir on the same
> inode, so there's something wrong with the lockdep annotations.
readdir() is certainly a red herring. write(), OTOH, is quite real.
And there we do i_mutex followed by pagefaults.
I *REALLY* dislike Trond's solution, though.
Could we please get a sane expalanation of the reasons why nfs mmap
wants i_mutex in the first place? Before we add yet another hook
from hell and complicate already overcomplicated area...
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [RFC PATCH 0/2] Fix up the NFS mmap code
2010-01-09 1:38 ` Al Viro
@ 2010-01-09 1:46 ` Al Viro
2010-01-09 1:57 ` Linus Torvalds
1 sibling, 0 replies; 14+ messages in thread
From: Al Viro @ 2010-01-09 1:46 UTC (permalink / raw)
To: Linus Torvalds; +Cc: Trond Myklebust, Andi Kleen, linux-kernel, linux-nfs
On Sat, Jan 09, 2010 at 01:38:25AM +0000, Al Viro wrote:
> On Fri, Jan 08, 2010 at 05:17:14PM -0800, Linus Torvalds wrote:
> >
> >
> > On Fri, 8 Jan 2010, Trond Myklebust wrote:
> > >
> > > How about something like the following. I chose to wrap the call to
> > > do_mmap_pgoff() instead of making a special ->pre_mmap(), since that
> > > seems more consistent with the way we handle ->read() and ->write().
> >
> > I still don't think that you can ever do mmap _and_ readdir on the same
> > inode, so there's something wrong with the lockdep annotations.
>
> readdir() is certainly a red herring. write(), OTOH, is quite real.
> And there we do i_mutex followed by pagefaults.
>
> I *REALLY* dislike Trond's solution, though.
>
> Could we please get a sane expalanation of the reasons why nfs mmap
> wants i_mutex in the first place? Before we add yet another hook
> from hell and complicate already overcomplicated area...
PS: mmap/write deadlock is real, AFAICT - unless something very subtle
prevents it, we can get buggered if we have two threads with the same
VM, mmap 1.4Mb from floppy and do
thread A:
write(fd_on_nfs, buffer_mmaped_from_floppy, 1440 * 1024);
thread B:
mmap(..., fd_on_nfs, ...)
It's not even particulary narrow.
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [RFC PATCH 2/2] NFS: Fix a potential deadlock in nfs_file_mmap()
[not found] ` <20100109005624.7473.15560.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
@ 2010-01-09 1:54 ` Al Viro
0 siblings, 0 replies; 14+ messages in thread
From: Al Viro @ 2010-01-09 1:54 UTC (permalink / raw)
To: Trond Myklebust; +Cc: Andi Kleen, Linus Torvalds, linux-kernel, linux-nfs
On Fri, Jan 08, 2010 at 07:56:24PM -0500, Trond Myklebust wrote:
> We cannot call nfs_invalidate_mapping() inside file->f_ops->mmap(), since
> this would cause us to grab the inode->i_mutex while already holding the
> current->mm->mmap_sem (thus causing a potential ABBA deadlock with the file
> write code, which can grab those locks in the opposite order).
>
> We can fix this situation for the mmap() system call by using the new
> mmap_pgoff() callback, which is called prior to taking the
> current->mm->mmap_sem mutex.
>
> We also add ensure that open() invalidates the mapping if the inode data is
> stale so that other users of mmap() (mainly the exec and uselib system
> calls) get up to date data too.
> + status = nfs_revalidate_mapping(inode, file->f_mapping);
> + if (status < 0)
> + return status;
> +
> + return generic_file_mmap_pgoff(file, addr, len, prot, flags, pgoff);
This is completely bogus. Why do you need i_mutex for that and what
the <expletives> does that really prevent? You might wait for a _loong_
time waiting for that mmap_sem, so what is really going on there?
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [RFC PATCH 0/2] Fix up the NFS mmap code
2010-01-09 1:38 ` Al Viro
2010-01-09 1:46 ` Al Viro
@ 2010-01-09 1:57 ` Linus Torvalds
[not found] ` <alpine.LFD.2.00.1001081750080.7821-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
1 sibling, 1 reply; 14+ messages in thread
From: Linus Torvalds @ 2010-01-09 1:57 UTC (permalink / raw)
To: Al Viro; +Cc: Trond Myklebust, Andi Kleen, linux-kernel, linux-nfs
On Sat, 9 Jan 2010, Al Viro wrote:
>
> readdir() is certainly a red herring.
That's the one that lockdep reports, though. I still don't see why. Afaik,
the only place where NFS gets an inode is nfs_fhget(), and that seems to
do things correctly.
> write(), OTOH, is quite real. And there we do i_mutex followed by
> pagefaults.
Ho humm..
> Could we please get a sane expalanation of the reasons why nfs mmap
> wants i_mutex in the first place? Before we add yet another hook
> from hell and complicate already overcomplicated area...
Yeah, agreed.
Linus
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [RFC PATCH 0/2] Fix up the NFS mmap code
[not found] ` <alpine.LFD.2.00.1001081750080.7821-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
@ 2010-01-09 2:11 ` Al Viro
2010-01-09 2:22 ` Linus Torvalds
0 siblings, 1 reply; 14+ messages in thread
From: Al Viro @ 2010-01-09 2:11 UTC (permalink / raw)
To: Linus Torvalds; +Cc: Trond Myklebust, Andi Kleen, linux-kernel, linux-nfs
On Fri, Jan 08, 2010 at 05:57:27PM -0800, Linus Torvalds wrote:
>
>
> On Sat, 9 Jan 2010, Al Viro wrote:
> >
> > readdir() is certainly a red herring.
>
> That's the one that lockdep reports, though. I still don't see why. Afaik,
> the only place where NFS gets an inode is nfs_fhget(), and that seems to
> do things correctly.
Well, sure - it steps on i_mutex-before-mmmap_sem first from ls somewhere and
records the ordering for posterity. Then NFS steps into mmap() (on a
different inode) and gets conflicting ordering.
It would be a false positive if rules for NFS *really* had been different
and it could safely grab i_mutex on NFS inodes inside mmap_sem. It can't.
The rules really are the same. And readdir is just the earliest case of
kernel stepping on mmap_sem while holding *some* i_mutex. write() is
another and there i_mutex can very well be the same as in case of mmap().
lockdep doesn't make a distinction (and really, how many paths reinforcing
the normal lock ordering would you record?), but if we'd given i_mutex of
NFS regular files a class of its own, we'd see a warning with nfs write
instead of readdir...
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [RFC PATCH 0/2] Fix up the NFS mmap code
2010-01-09 2:11 ` Al Viro
@ 2010-01-09 2:22 ` Linus Torvalds
[not found] ` <alpine.LFD.2.00.1001081814240.7821-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
0 siblings, 1 reply; 14+ messages in thread
From: Linus Torvalds @ 2010-01-09 2:22 UTC (permalink / raw)
To: Al Viro; +Cc: Trond Myklebust, Andi Kleen, linux-kernel, linux-nfs
On Sat, 9 Jan 2010, Al Viro wrote:
>
> Well, sure - it steps on i_mutex-before-mmmap_sem first from ls somewhere and
> records the ordering for posterity. Then NFS steps into mmap() (on a
> different inode) and gets conflicting ordering.
Look closer: the inodes for directories and for non-directories have
i_mutex in different lockdep classes.
So that "on a different inode" thing should have made it a non-issue,
since there is no actual chain back. There is "mmap_sem ->
i_mutex_regular_file" (for mmap) and there is "i_mutex_directory ->
mmap_sem" (for filldir), but that isn't an ABBA.
The problem _seems_ to be (if I read Andi's chain correctly) that a
directory hasn't gone through the i_mutex_dir_key change, so filldir ends
up being counted against the default i_mutex_key.
Linus
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [RFC PATCH 0/2] Fix up the NFS mmap code
[not found] ` <alpine.LFD.2.00.1001081814240.7821-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
@ 2010-01-09 2:30 ` Al Viro
2010-01-09 2:40 ` Al Viro
0 siblings, 1 reply; 14+ messages in thread
From: Al Viro @ 2010-01-09 2:30 UTC (permalink / raw)
To: Linus Torvalds; +Cc: Trond Myklebust, Andi Kleen, linux-kernel, linux-nfs
On Fri, Jan 08, 2010 at 06:22:01PM -0800, Linus Torvalds wrote:
>
>
> On Sat, 9 Jan 2010, Al Viro wrote:
> >
> > Well, sure - it steps on i_mutex-before-mmmap_sem first from ls somewhere and
> > records the ordering for posterity. Then NFS steps into mmap() (on a
> > different inode) and gets conflicting ordering.
>
> Look closer: the inodes for directories and for non-directories have
> i_mutex in different lockdep classes.
>
> So that "on a different inode" thing should have made it a non-issue,
> since there is no actual chain back. There is "mmap_sem ->
> i_mutex_regular_file" (for mmap) and there is "i_mutex_directory ->
> mmap_sem" (for filldir), but that isn't an ABBA.
>
> The problem _seems_ to be (if I read Andi's chain correctly) that a
> directory hasn't gone through the i_mutex_dir_key change, so filldir ends
> up being counted against the default i_mutex_key.
Interesting... There's nfs_update_inode(), but it ought to scream bloody
murder on the type change (and its return value is ignore by nfs_fhget(),
BTW).
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [RFC PATCH 0/2] Fix up the NFS mmap code
2010-01-09 2:30 ` Al Viro
@ 2010-01-09 2:40 ` Al Viro
2010-01-09 2:43 ` Al Viro
0 siblings, 1 reply; 14+ messages in thread
From: Al Viro @ 2010-01-09 2:40 UTC (permalink / raw)
To: Linus Torvalds; +Cc: Trond Myklebust, Andi Kleen, linux-kernel, linux-nfs
On Sat, Jan 09, 2010 at 02:30:35AM +0000, Al Viro wrote:
> Interesting... There's nfs_update_inode(), but it ought to scream bloody
> murder on the type change (and its return value is ignore by nfs_fhget(),
> BTW).
Uh-oh... Just what will happen if we get FATTR_MODE without FATTR_TYPE,
BTW? AFAICS, the check for changed type won't trigger, but we will
get type bits in i_mode set to 0. Probably unrelated, but...
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [RFC PATCH 0/2] Fix up the NFS mmap code
2010-01-09 2:40 ` Al Viro
@ 2010-01-09 2:43 ` Al Viro
0 siblings, 0 replies; 14+ messages in thread
From: Al Viro @ 2010-01-09 2:43 UTC (permalink / raw)
To: Linus Torvalds; +Cc: Trond Myklebust, Andi Kleen, linux-kernel, linux-nfs
On Sat, Jan 09, 2010 at 02:40:05AM +0000, Al Viro wrote:
> On Sat, Jan 09, 2010 at 02:30:35AM +0000, Al Viro wrote:
>
> > Interesting... There's nfs_update_inode(), but it ought to scream bloody
> > murder on the type change (and its return value is ignore by nfs_fhget(),
> > BTW).
>
> Uh-oh... Just what will happen if we get FATTR_MODE without FATTR_TYPE,
> BTW? AFAICS, the check for changed type won't trigger, but we will
> get type bits in i_mode set to 0. Probably unrelated, but...
Definitely unrelated - we'll get i_mode buggered, but i_op remains set.
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [RFC PATCH 0/2] Fix up the NFS mmap code
[not found] ` <20100109005624.7473.33215.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
2010-01-09 0:56 ` [RFC PATCH 1/2] VFS: Add a mmap_file() callback to struct file_operations Trond Myklebust
2010-01-09 1:17 ` [RFC PATCH 0/2] Fix up the NFS mmap code Linus Torvalds
@ 2010-01-10 2:00 ` Andi Kleen
2 siblings, 0 replies; 14+ messages in thread
From: Andi Kleen @ 2010-01-10 2:00 UTC (permalink / raw)
To: Trond Myklebust; +Cc: Andi Kleen, Linus Torvalds, linux-kernel, linux-nfs
On Fri, Jan 08, 2010 at 07:56:24PM -0500, Trond Myklebust wrote:
> How about something like the following. I chose to wrap the call to
> do_mmap_pgoff() instead of making a special ->pre_mmap(), since that
> seems more consistent with the way we handle ->read() and ->write().
I tested the patch and I don't see the lockdep problem anymore
and seems to work so far on the test system with some quick tests.
-Andi
--
ak@linux.intel.com -- Speaking for myself only.
^ permalink raw reply [flat|nested] 14+ messages in thread
end of thread, other threads:[~2010-01-10 2:00 UTC | newest]
Thread overview: 14+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
[not found] <1262913974.2659.101.camel@localhost>
2010-01-09 0:56 ` [RFC PATCH 0/2] Fix up the NFS mmap code Trond Myklebust
2010-01-09 0:56 ` [RFC PATCH 2/2] NFS: Fix a potential deadlock in nfs_file_mmap() Trond Myklebust
[not found] ` <20100109005624.7473.15560.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
2010-01-09 1:54 ` Al Viro
[not found] ` <20100109005624.7473.33215.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
2010-01-09 0:56 ` [RFC PATCH 1/2] VFS: Add a mmap_file() callback to struct file_operations Trond Myklebust
2010-01-09 1:17 ` [RFC PATCH 0/2] Fix up the NFS mmap code Linus Torvalds
[not found] ` <alpine.LFD.2.00.1001081709470.7821-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
2010-01-09 1:38 ` Al Viro
2010-01-09 1:46 ` Al Viro
2010-01-09 1:57 ` Linus Torvalds
[not found] ` <alpine.LFD.2.00.1001081750080.7821-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
2010-01-09 2:11 ` Al Viro
2010-01-09 2:22 ` Linus Torvalds
[not found] ` <alpine.LFD.2.00.1001081814240.7821-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
2010-01-09 2:30 ` Al Viro
2010-01-09 2:40 ` Al Viro
2010-01-09 2:43 ` Al Viro
2010-01-10 2:00 ` Andi Kleen
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox