[PATCH] nfsd: don't hold i_mutex over userspace upcalls

linux-nfs.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH] nfsd: don't hold i_mutex over userspace upcalls
@ 2016-01-07 21:08 J. Bruce Fields
  0 siblings, 0 replies; 7+ messages in thread
From: J. Bruce Fields @ 2016-01-07 21:08 UTC (permalink / raw)
  To: Alexander Viro; +Cc: linux-nfs, linux-fsdevel

From: NeilBrown <neilb@suse.de>

We need information about exports when crossing mountpoints during
lookup or NFSv4 readdir.  If we don't already have that information
cached, we may have to ask (and wait for) rpc.mountd.

In both cases we currently hold the i_mutex on the parent of the
directory we're asking rpc.mountd about.  We've seen situations where
rpc.mountd performs some operation on that directory that tries to take
the i_mutex again, resulting in deadlock.

With some care, we may be able to avoid that in rpc.mountd.  But it
seems better just to avoid holding a mutex while waiting on userspace.

It appears that lookup_one_len is pretty much the only operation that
needs the i_mutex.  So we could just drop the i_mutex elsewhere and do
something like

	mutex_lock()
	lookup_one_len()
	mutex_unlock()

In many cases though the lookup would have been cached and not required
the i_mutex, so it's more efficient to create a lookup_one_len() variant
that only takes the i_mutex when necessary.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/namei.c            | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/nfsd/nfs3xdr.c     |  2 +-
 fs/nfsd/nfs4xdr.c     |  8 +++---
 fs/nfsd/vfs.c         | 23 +++++++---------
 include/linux/namei.h |  1 +
 5 files changed, 89 insertions(+), 19 deletions(-)

Could we get this reviewed for 4.5?

--b.

diff --git a/fs/namei.c b/fs/namei.c
index d84d7c7515fc..174ef4f106cd 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2279,6 +2279,8 @@ EXPORT_SYMBOL(vfs_path_lookup);
  *
  * Note that this routine is purely a helper for filesystem usage and should
  * not be called by generic code.
+ *
+ * The caller must hold base->i_mutex.
  */
 struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 {
@@ -2322,6 +2324,78 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 }
 EXPORT_SYMBOL(lookup_one_len);
 
+/**
+ * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
+ * @name:	pathname component to lookup
+ * @base:	base directory to lookup from
+ * @len:	maximum length @len should be interpreted to
+ *
+ * Note that this routine is purely a helper for filesystem usage and should
+ * not be called by generic code.
+ *
+ * Unlike lookup_one_len, it should be called without the parent
+ * i_mutex held, and will take the i_mutex itself if necessary.
+ */
+struct dentry *lookup_one_len_unlocked(const char *name,
+				       struct dentry *base, int len)
+{
+	struct qstr this;
+	unsigned int c;
+	int err;
+	struct dentry *ret;
+
+	this.name = name;
+	this.len = len;
+	this.hash = full_name_hash(name, len);
+	if (!len)
+		return ERR_PTR(-EACCES);
+
+	if (unlikely(name[0] == '.')) {
+		if (len < 2 || (len == 2 && name[1] == '.'))
+			return ERR_PTR(-EACCES);
+	}
+
+	while (len--) {
+		c = *(const unsigned char *)name++;
+		if (c == '/' || c == '\0')
+			return ERR_PTR(-EACCES);
+	}
+	/*
+	 * See if the low-level filesystem might want
+	 * to use its own hash..
+	 */
+	if (base->d_flags & DCACHE_OP_HASH) {
+		int err = base->d_op->d_hash(base, &this);
+		if (err < 0)
+			return ERR_PTR(err);
+	}
+
+	err = inode_permission(base->d_inode, MAY_EXEC);
+	if (err)
+		return ERR_PTR(err);
+
+	ret = __d_lookup(base, &this);
+	if (ret)
+		return ret;
+	/*
+	 * __d_lookup() is used to try to get a quick answer and avoid the
+	 * mutex.  A false-negative does no harm.
+	 */
+	ret = __d_lookup(base, &this);
+	if (ret && ret->d_flags & DCACHE_OP_REVALIDATE) {
+		dput(ret);
+		ret = NULL;
+	}
+	if (ret)
+		return ret;
+
+	mutex_lock(&base->d_inode->i_mutex);
+	ret =  __lookup_hash(&this, base, 0);
+	mutex_unlock(&base->d_inode->i_mutex);
+	return ret;
+}
+EXPORT_SYMBOL(lookup_one_len_unlocked);
+
 int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
 		 struct path *path, int *empty)
 {
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 00575d776d91..2246454dec76 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -823,7 +823,7 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
 		} else
 			dchild = dget(dparent);
 	} else
-		dchild = lookup_one_len(name, dparent, namlen);
+		dchild = lookup_one_len_unlocked(name, dparent, namlen);
 	if (IS_ERR(dchild))
 		return rv;
 	if (d_mountpoint(dchild))
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 51c9e9ca39a4..325521ce389a 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2838,14 +2838,14 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd,
 	__be32 nfserr;
 	int ignore_crossmnt = 0;
 
-	dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen);
+	dentry = lookup_one_len_unlocked(name, cd->rd_fhp->fh_dentry, namlen);
 	if (IS_ERR(dentry))
 		return nfserrno(PTR_ERR(dentry));
 	if (d_really_is_negative(dentry)) {
 		/*
-		 * nfsd_buffered_readdir drops the i_mutex between
-		 * readdir and calling this callback, leaving a window
-		 * where this directory entry could have gone away.
+		 * we're not holding the i_mutex here, so there's
+		 * a window where this directory entry could have gone
+		 * away.
 		 */
 		dput(dentry);
 		return nfserr_noent;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 994d66fbb446..4212aaacbb55 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -217,10 +217,16 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		host_err = PTR_ERR(dentry);
 		if (IS_ERR(dentry))
 			goto out_nfserr;
-		/*
-		 * check if we have crossed a mount point ...
-		 */
 		if (nfsd_mountpoint(dentry, exp)) {
+			/*
+			 * We don't need the i_mutex after all.  It's
+			 * still possible we could open this (regular
+			 * files can be mountpoints too), but the
+			 * i_mutex is just there to prevent renames of
+			 * something that we might be about to delegate,
+			 * and a mountpoint won't be renamed:
+			 */
+			fh_unlock(fhp);
 			if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) {
 				dput(dentry);
 				goto out_nfserr;
@@ -1809,7 +1815,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
 	offset = *offsetp;
 
 	while (1) {
-		struct inode *dir_inode = file_inode(file);
 		unsigned int reclen;
 
 		cdp->err = nfserr_eof; /* will be cleared on successful read */
@@ -1828,15 +1833,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
 		if (!size)
 			break;
 
-		/*
-		 * Various filldir functions may end up calling back into
-		 * lookup_one_len() and the file system's ->lookup() method.
-		 * These expect i_mutex to be held, as it would within readdir.
-		 */
-		host_err = mutex_lock_killable(&dir_inode->i_mutex);
-		if (host_err)
-			break;
-
 		de = (struct buffered_dirent *)buf.dirent;
 		while (size > 0) {
 			offset = de->offset;
@@ -1853,7 +1849,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
 			size -= reclen;
 			de = (struct buffered_dirent *)((char *)de + reclen);
 		}
-		mutex_unlock(&dir_inode->i_mutex);
 		if (size > 0) /* We bailed out early */
 			break;
 
diff --git a/include/linux/namei.h b/include/linux/namei.h
index d8c6334cd150..d0f25d81b46a 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -77,6 +77,7 @@ extern struct dentry *kern_path_locked(const char *, struct path *);
 extern int kern_path_mountpoint(int, const char *, struct path *, unsigned int);
 
 extern struct dentry *lookup_one_len(const char *, struct dentry *, int);
+extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int);
 
 extern int follow_down_one(struct path *);
 extern int follow_down(struct path *);
-- 
2.5.0


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH RFC] NFSD: fix cannot umounting mount points under pseudo root
@ 2015-04-29 19:19 J. Bruce Fields
  2015-04-29 21:52 ` NeilBrown
  0 siblings, 1 reply; 7+ messages in thread
From: J. Bruce Fields @ 2015-04-29 19:19 UTC (permalink / raw)
  To: NeilBrown; +Cc: Kinglong Mee, linux-nfs@vger.kernel.org

On Wed, Apr 29, 2015 at 12:57:28PM +1000, NeilBrown wrote:
> In any case, holding a mutex while waiting for an upcall is probably a bad
> idea.  We should try to find a way to work around that...

Yeah, it sounds fragile even if we could fix this particular issue in
mountd.

> Any ideas Bruce ?-)

Looking at nfsd_buffered_readdir():

	/*
	 * Various filldir functions may end up calling back into
	 * lookup_one_len() and the file system's ->lookup() method.
	 * These expect i_mutex to be held, as it would within readdir.
	 */
        host_err = mutex_lock_killable(&dir_inode->i_mutex);

and as far as I can tell Kinglong's approach (adding an unlock and lock
around nfsd_cross_mnt() calls might actually be OK.

Though in general I expect

	lock()
	...code...
	unlock()

to mark a critical section and would be unpleasantly surprised to
discover that a function somewhere in the middle had a buried
unlock/lock pair.

Maybe drop the locking from nfsd_buffered_readdir and *just* take the
i_mutex around lookup_one_len(), if that's the only place we need it?

--b.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH RFC] NFSD: fix cannot umounting mount points under pseudo root
  2015-04-29 19:19 [PATCH RFC] NFSD: fix cannot umounting mount points under pseudo root J. Bruce Fields
@ 2015-04-29 21:52 ` NeilBrown
  2015-04-30 21:36   ` J. Bruce Fields
  0 siblings, 1 reply; 7+ messages in thread
From: NeilBrown @ 2015-04-29 21:52 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: Kinglong Mee, linux-nfs@vger.kernel.org

[-- Attachment #1: Type: text/plain, Size: 3061 bytes --]

On Wed, 29 Apr 2015 15:19:34 -0400 "J. Bruce Fields" <bfields@fieldses.org>
wrote:

> On Wed, Apr 29, 2015 at 12:57:28PM +1000, NeilBrown wrote:
> > In any case, holding a mutex while waiting for an upcall is probably a bad
> > idea.  We should try to find a way to work around that...
> 
> Yeah, it sounds fragile even if we could fix this particular issue in
> mountd.
> 
> > Any ideas Bruce ?-)
> 
> Looking at nfsd_buffered_readdir():
> 
> 	/*
> 	 * Various filldir functions may end up calling back into
> 	 * lookup_one_len() and the file system's ->lookup() method.
> 	 * These expect i_mutex to be held, as it would within readdir.
> 	 */
>         host_err = mutex_lock_killable(&dir_inode->i_mutex);
> 
> and as far as I can tell Kinglong's approach (adding an unlock and lock
> around nfsd_cross_mnt() calls might actually be OK.

Yes, I think now that it would work.  It would look odd though, as you
suggest.
There would need to be very clear comments explaining why the lock is being
managed that way.

> 
> Though in general I expect
> 
> 	lock()
> 	...code...
> 	unlock()
> 
> to mark a critical section and would be unpleasantly surprised to
> discover that a function somewhere in the middle had a buried
> unlock/lock pair.
> 
> Maybe drop the locking from nfsd_buffered_readdir and *just* take the
> i_mutex around lookup_one_len(), if that's the only place we need it?

I think it is the only place needed.  Certainly normal path lookup only takes
i_mutex very briefly around __lookup_hash().

One cost would be that we would take the mutex for every name instead of once
for a whole set of names.  That is generally frowned upon for performance
reasons.

An alternative might be to stop using lookup_one_len, and use kern_path()
instead.  Or kern_path_mountpoint if we don't want to cross a mountpoint.

They would only take the i_mutex if absolutely necessary.

The draw back is that they don't take a 'len' argument, so we would need to
make sure the name  is nul terminated (and maybe double-check that there is
no '/'??).  It would be fairly easy to nul-terminate names from readdir -
just use a bit more space  in the buffer in nfsd_buffered_filldir().

I'm less sure about the locking needed in nfsd_lookup_dentry().
The comments suggests that there is good reason to keep the lock for an
extended period.  But that reasoning might only apply to files, and
nfsd_mountpoint should  only be true for directories... I hope.

A different approach would be to pass NULL for the rqstp to nfsd_cross_mnt().
This will ensure it doesn't block, but it might fail incorrectly.
If it does fail, we drop the lock, retry with the real rqstp, retake the lock
and .... no, I think that gets a bit too messy.

I think I'm in favour of:
 - not taking the lock in readdir
 - maybe not taking it in lookup
 - using kern_path_mountpoint or kern_path
 - nul terminating then names, copying the nfsd_lookup name to
   __getname() if necessary.

Sound reasonable?

NeilBrown

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 811 bytes --]

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH RFC] NFSD: fix cannot umounting mount points under pseudo root
  2015-04-29 21:52 ` NeilBrown
@ 2015-04-30 21:36   ` J. Bruce Fields
  2015-05-01  1:53     ` NeilBrown
  0 siblings, 1 reply; 7+ messages in thread
From: J. Bruce Fields @ 2015-04-30 21:36 UTC (permalink / raw)
  To: NeilBrown; +Cc: Kinglong Mee, linux-nfs@vger.kernel.org

On Thu, Apr 30, 2015 at 07:52:25AM +1000, NeilBrown wrote:
> On Wed, 29 Apr 2015 15:19:34 -0400 "J. Bruce Fields" <bfields@fieldses.org>
> wrote:
> > Maybe drop the locking from nfsd_buffered_readdir and *just* take the
> > i_mutex around lookup_one_len(), if that's the only place we need it?
> 
> I think it is the only place needed.  Certainly normal path lookup
> only takes i_mutex very briefly around __lookup_hash().
> 
> One cost would be that we would take the mutex for every name instead
> of once for a whole set of names.  That is generally frowned upon for
> performance reasons.
> 
> An alternative might be to stop using lookup_one_len, and use
> kern_path() instead.  Or kern_path_mountpoint if we don't want to
> cross a mountpoint.
> 
> They would only take the i_mutex if absolutely necessary.
> 
> The draw back is that they don't take a 'len' argument, so we would
> need to make sure the name  is nul terminated (and maybe double-check
> that there is no '/'??).  It would be fairly easy to nul-terminate
> names from readdir - just use a bit more space  in the buffer in
> nfsd_buffered_filldir().

They're also a lot more complicated than lookup_one_len.  Is any of this
extra stuff they do (audit?) going to bite us?

For me understanding that would be harder than writing a variant of
lookup_one_len that took the i_mutex when required.  But I guess that's
my problem, I should try to understand the lookup code better....

> I'm less sure about the locking needed in nfsd_lookup_dentry().  The
> comments suggests that there is good reason to keep the lock for an
> extended period.  But that reasoning might only apply to files, and
> nfsd_mountpoint should  only be true for directories... I hope.

I thought d_mountpoint() could be true for files, but certainly we won't
be doing nfs4 opens on directories.

> A different approach would be to pass NULL for the rqstp to
> nfsd_cross_mnt().  This will ensure it doesn't block, but it might
> fail incorrectly.  If it does fail, we drop the lock, retry with the
> real rqstp, retake the lock and .... no, I think that gets a bit too
> messy.

Yes.

> I think I'm in favour of:
>  - not taking the lock in readdir
>  - maybe not taking it in lookup
>  - using kern_path_mountpoint or kern_path
>  - nul terminating then names, copying the nfsd_lookup name to
>    __getname() if necessary.
> 
> Sound reasonable?

I guess so, though I wish I understood kern_path_mountpoint better.

And nfsd_lookup_dentry looks like work for another day.  No, wait, I
forgot the goal here: you're right, nfsd_lookup_dentry has the same
upcall-under-i_mutex problem, so we need to fix it too.

OK, agreed.

--b.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH RFC] NFSD: fix cannot umounting mount points under pseudo root
  2015-04-30 21:36   ` J. Bruce Fields
@ 2015-05-01  1:53     ` NeilBrown
  2015-05-04 22:01       ` J. Bruce Fields
  0 siblings, 1 reply; 7+ messages in thread
From: NeilBrown @ 2015-05-01  1:53 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: Kinglong Mee, linux-nfs@vger.kernel.org

[-- Attachment #1: Type: text/plain, Size: 5434 bytes --]

On Thu, 30 Apr 2015 17:36:02 -0400 "J. Bruce Fields" <bfields@fieldses.org>
wrote:

> On Thu, Apr 30, 2015 at 07:52:25AM +1000, NeilBrown wrote:
> > On Wed, 29 Apr 2015 15:19:34 -0400 "J. Bruce Fields" <bfields@fieldses.org>
> > wrote:
> > > Maybe drop the locking from nfsd_buffered_readdir and *just* take the
> > > i_mutex around lookup_one_len(), if that's the only place we need it?
> > 
> > I think it is the only place needed.  Certainly normal path lookup
> > only takes i_mutex very briefly around __lookup_hash().
> > 
> > One cost would be that we would take the mutex for every name instead
> > of once for a whole set of names.  That is generally frowned upon for
> > performance reasons.
> > 
> > An alternative might be to stop using lookup_one_len, and use
> > kern_path() instead.  Or kern_path_mountpoint if we don't want to
> > cross a mountpoint.
> > 
> > They would only take the i_mutex if absolutely necessary.
> > 
> > The draw back is that they don't take a 'len' argument, so we would
> > need to make sure the name  is nul terminated (and maybe double-check
> > that there is no '/'??).  It would be fairly easy to nul-terminate
> > names from readdir - just use a bit more space  in the buffer in
> > nfsd_buffered_filldir().
> 
> They're also a lot more complicated than lookup_one_len.  Is any of this
> extra stuff they do (audit?) going to bite us?
> 
> For me understanding that would be harder than writing a variant of
> lookup_one_len that took the i_mutex when required.  But I guess that's
> my problem, I should try to understand the lookup code better....

Tempting though ... see below (untested).

While writing that I began to wonder if lookup_one_len is really the right
interface to be used, even though it was introduced (in 2.3.99pre2-4)
specifically for nfsd.
The problem is that it assumes things about the filesystem.  So it makes
perfect sense for various filesystems to use it on themselves, but I'm not
sure how *right* it is for nfsd (or cachefiles etc) to use it on some
*other* filesystem.
The particular issue is that it avoids the d_revalidate call.
Both vfat and reiserfs have that call ... I wonder if that could ever be a
problem.

So I'm really leaning towards creating a variant of kern_path_mountpoint and
using a variant of that which takes a length.

I think adding the d_revalidate is correct  and even adding auditing is
probably a good idea.

Maybe I'll try that (unless someone beats me to it...)

NeilBrown


> 
> > I'm less sure about the locking needed in nfsd_lookup_dentry().  The
> > comments suggests that there is good reason to keep the lock for an
> > extended period.  But that reasoning might only apply to files, and
> > nfsd_mountpoint should  only be true for directories... I hope.
> 
> I thought d_mountpoint() could be true for files, but certainly we won't
> be doing nfs4 opens on directories.
> 
> > A different approach would be to pass NULL for the rqstp to
> > nfsd_cross_mnt().  This will ensure it doesn't block, but it might
> > fail incorrectly.  If it does fail, we drop the lock, retry with the
> > real rqstp, retake the lock and .... no, I think that gets a bit too
> > messy.
> 
> Yes.
> 
> > I think I'm in favour of:
> >  - not taking the lock in readdir
> >  - maybe not taking it in lookup
> >  - using kern_path_mountpoint or kern_path
> >  - nul terminating then names, copying the nfsd_lookup name to
> >    __getname() if necessary.
> > 
> > Sound reasonable?
> 
> I guess so, though I wish I understood kern_path_mountpoint better.
> 
> And nfsd_lookup_dentry looks like work for another day.  No, wait, I
> forgot the goal here: you're right, nfsd_lookup_dentry has the same
> upcall-under-i_mutex problem, so we need to fix it too.
> 
> OK, agreed.
> 
> --b.


diff --git a/fs/namei.c b/fs/namei.c
index 4a8d998b7274..d6b2dc36029e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2182,6 +2182,56 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 }
 EXPORT_SYMBOL(lookup_one_len);
 
+struct dentry *lookup_one_len_unlocked(const char *name,
+				       struct dentry *base, int len)
+{
+	struct qstr this;
+	unsigned int c;
+	int err;
+	struct dentry *ret;
+
+	WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));
+
+	this.name = name;
+	this.len = len;
+	this.hash = full_name_hash(name, len);
+	if (!len)
+		return ERR_PTR(-EACCES);
+
+	if (unlikely(name[0] == '.')) {
+		if (len < 2 || (len == 2 && name[1] == '.'))
+			return ERR_PTR(-EACCES);
+	}
+
+	while (len--) {
+		c = *(const unsigned char *)name++;
+		if (c == '/' || c == '\0')
+			return ERR_PTR(-EACCES);
+	}
+	/*
+	 * See if the low-level filesystem might want
+	 * to use its own hash..
+	 */
+	if (base->d_flags & DCACHE_OP_HASH) {
+		int err = base->d_op->d_hash(base, &this);
+		if (err < 0)
+			return ERR_PTR(err);
+	}
+
+	err = inode_permission(base->d_inode, MAY_EXEC);
+	if (err)
+		return ERR_PTR(err);
+
+	ret = __d_lookup(base, &this);
+	if (ret)
+		retrun ret;
+	mutex_lock(&base->d_inode->i_mutex);
+	ret =  __lookup_hash(&this, base, 0);
+	mutex_unlock(&base->d_inode->i_mutex);
+	return ret;
+}
+EXPORT_SYMBOL(lookup_one_len_unlocked);
+
 int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
 		 struct path *path, int *empty)
 {

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 811 bytes --]

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH RFC] NFSD: fix cannot umounting mount points under pseudo root
  2015-05-01  1:53     ` NeilBrown
@ 2015-05-04 22:01       ` J. Bruce Fields
  2015-05-05 13:54         ` Kinglong Mee
  0 siblings, 1 reply; 7+ messages in thread
From: J. Bruce Fields @ 2015-05-04 22:01 UTC (permalink / raw)
  To: NeilBrown; +Cc: Kinglong Mee, linux-nfs@vger.kernel.org

On Fri, May 01, 2015 at 11:53:26AM +1000, NeilBrown wrote:
> On Thu, 30 Apr 2015 17:36:02 -0400 "J. Bruce Fields" <bfields@fieldses.org>
> wrote:
> 
> > On Thu, Apr 30, 2015 at 07:52:25AM +1000, NeilBrown wrote:
> > > On Wed, 29 Apr 2015 15:19:34 -0400 "J. Bruce Fields" <bfields@fieldses.org>
> > > wrote:
> > > > Maybe drop the locking from nfsd_buffered_readdir and *just* take the
> > > > i_mutex around lookup_one_len(), if that's the only place we need it?
> > > 
> > > I think it is the only place needed.  Certainly normal path lookup
> > > only takes i_mutex very briefly around __lookup_hash().
> > > 
> > > One cost would be that we would take the mutex for every name instead
> > > of once for a whole set of names.  That is generally frowned upon for
> > > performance reasons.
> > > 
> > > An alternative might be to stop using lookup_one_len, and use
> > > kern_path() instead.  Or kern_path_mountpoint if we don't want to
> > > cross a mountpoint.
> > > 
> > > They would only take the i_mutex if absolutely necessary.
> > > 
> > > The draw back is that they don't take a 'len' argument, so we would
> > > need to make sure the name  is nul terminated (and maybe double-check
> > > that there is no '/'??).  It would be fairly easy to nul-terminate
> > > names from readdir - just use a bit more space  in the buffer in
> > > nfsd_buffered_filldir().
> > 
> > They're also a lot more complicated than lookup_one_len.  Is any of this
> > extra stuff they do (audit?) going to bite us?
> > 
> > For me understanding that would be harder than writing a variant of
> > lookup_one_len that took the i_mutex when required.  But I guess that's
> > my problem, I should try to understand the lookup code better....
> 
> Tempting though ... see below (untested).

With documentation and a stab at the nfsd stuff (also untested.  OK, and
pretty much unread.  Compiles, though!)

--b.

commit 6023d45abd1a
Author: NeilBrown <neilb@suse.de>
Date:   Fri May 1 11:53:26 2015 +1000

    nfsd: don't hold i_mutex over userspace upcalls
    
    We need information about exports when crossing mountpoints during
    lookup or NFSv4 readdir.  If we don't already have that information
    cached, we may have to ask (and wait for) rpc.mountd.
    
    In both cases we currently hold the i_mutex on the parent of the
    directory we're asking rpc.mountd about.  We've seen situations where
    rpc.mountd performs some operation on that directory that tries to take
    the i_mutex again, resulting in deadlock.
    
    With some care, we may be able to avoid that in rpc.mountd.  But it
    seems better just to avoid holding a mutex while waiting on userspace.
    
    It appears that lookup_one_len is pretty much the only operation that
    needs the i_mutex.  So we could just drop the i_mutex elsewhere and do
    something like
    
    	mutex_lock()
    	lookup_one_len()
    	mutex_unlock()
    
    In may cases though the lookup would have been cached and not required
    the i_mutex, so it's more efficient to create a lookup_one_len() variant
    that only takes the i_mutex when necessary.
    
    Signed-off-by: J. Bruce Fields <bfields@redhat.com>

diff --git a/fs/namei.c b/fs/namei.c
index 4a8d998b7274..5ec103d4775d 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2139,6 +2139,8 @@ static struct dentry *lookup_hash(struct nameidata *nd)
  *
  * Note that this routine is purely a helper for filesystem usage and should
  * not be called by generic code.
+ *
+ * Must be called with the parented i_mutex held.
  */
 struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 {
@@ -2182,6 +2184,68 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 }
 EXPORT_SYMBOL(lookup_one_len);
 
+/**
+ * lookup_one_len - filesystem helper to lookup single pathname component
+ * @name:	pathname component to lookup
+ * @base:	base directory to lookup from
+ * @len:	maximum length @len should be interpreted to
+ *
+ * Note that this routine is purely a helper for filesystem usage and should
+ * not be called by generic code.
+ *
+ * Unlike lookup_one_len, it should be called without the parent
+ * i_mutex held, and will take the i_mutex itself if necessary.
+ */
+struct dentry *lookup_one_len_unlocked(const char *name,
+				       struct dentry *base, int len)
+{
+	struct qstr this;
+	unsigned int c;
+	int err;
+	struct dentry *ret;
+
+	WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));
+
+	this.name = name;
+	this.len = len;
+	this.hash = full_name_hash(name, len);
+	if (!len)
+		return ERR_PTR(-EACCES);
+
+	if (unlikely(name[0] == '.')) {
+		if (len < 2 || (len == 2 && name[1] == '.'))
+			return ERR_PTR(-EACCES);
+	}
+
+	while (len--) {
+		c = *(const unsigned char *)name++;
+		if (c == '/' || c == '\0')
+			return ERR_PTR(-EACCES);
+	}
+	/*
+	 * See if the low-level filesystem might want
+	 * to use its own hash..
+	 */
+	if (base->d_flags & DCACHE_OP_HASH) {
+		int err = base->d_op->d_hash(base, &this);
+		if (err < 0)
+			return ERR_PTR(err);
+	}
+
+	err = inode_permission(base->d_inode, MAY_EXEC);
+	if (err)
+		return ERR_PTR(err);
+
+	ret = __d_lookup(base, &this);
+	if (ret)
+		return ret;
+	mutex_lock(&base->d_inode->i_mutex);
+	ret =  __lookup_hash(&this, base, 0);
+	mutex_unlock(&base->d_inode->i_mutex);
+	return ret;
+}
+EXPORT_SYMBOL(lookup_one_len_unlocked);
+
 int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
 		 struct path *path, int *empty)
 {
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 158badf945df..2c1adaa0bd2f 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2804,14 +2804,14 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd,
 	__be32 nfserr;
 	int ignore_crossmnt = 0;
 
-	dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen);
+	dentry = lookup_one_len_unlocked(name, cd->rd_fhp->fh_dentry, namlen);
 	if (IS_ERR(dentry))
 		return nfserrno(PTR_ERR(dentry));
 	if (d_really_is_negative(dentry)) {
 		/*
-		 * nfsd_buffered_readdir drops the i_mutex between
-		 * readdir and calling this callback, leaving a window
-		 * where this directory entry could have gone away.
+		 * we're not holding the i_mutex here, so there's
+		 * a window where this directory entry could have gone
+		 * away.
 		 */
 		dput(dentry);
 		return nfserr_noent;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index a30e79900086..cc7995762190 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -217,6 +217,13 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		host_err = PTR_ERR(dentry);
 		if (IS_ERR(dentry))
 			goto out_nfserr;
+		if (!S_ISREG(d_inode(dentry)->i_mode)) {
+			/*
+			 * Never mind, we're not going to open this
+			 * anyway.  And we don't want to hold it over
+			 * the userspace upcalls in nfsd_mountpoint. */
+			fh_unlock(fhp);
+		}
 		/*
 		 * check if we have crossed a mount point ...
 		 */
@@ -1876,7 +1883,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
 	offset = *offsetp;
 
 	while (1) {
-		struct inode *dir_inode = file_inode(file);
 		unsigned int reclen;
 
 		cdp->err = nfserr_eof; /* will be cleared on successful read */
@@ -1895,15 +1901,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
 		if (!size)
 			break;
 
-		/*
-		 * Various filldir functions may end up calling back into
-		 * lookup_one_len() and the file system's ->lookup() method.
-		 * These expect i_mutex to be held, as it would within readdir.
-		 */
-		host_err = mutex_lock_killable(&dir_inode->i_mutex);
-		if (host_err)
-			break;
-
 		de = (struct buffered_dirent *)buf.dirent;
 		while (size > 0) {
 			offset = de->offset;
@@ -1920,7 +1917,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
 			size -= reclen;
 			de = (struct buffered_dirent *)((char *)de + reclen);
 		}
-		mutex_unlock(&dir_inode->i_mutex);
 		if (size > 0) /* We bailed out early */
 			break;
 
diff --git a/include/linux/namei.h b/include/linux/namei.h
index c8990779f0c3..bb3a2f7cca67 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -62,6 +62,7 @@ extern struct dentry *kern_path_locked(const char *, struct path *);
 extern int kern_path_mountpoint(int, const char *, struct path *, unsigned int);
 
 extern struct dentry *lookup_one_len(const char *, struct dentry *, int);
+extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int);
 
 extern int follow_down_one(struct path *);
 extern int follow_down(struct path *);

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH RFC] NFSD: fix cannot umounting mount points under pseudo root
  2015-05-04 22:01       ` J. Bruce Fields
@ 2015-05-05 13:54         ` Kinglong Mee
  2015-05-05 14:18           ` J. Bruce Fields
  0 siblings, 1 reply; 7+ messages in thread
From: Kinglong Mee @ 2015-05-05 13:54 UTC (permalink / raw)
  To: J. Bruce Fields, NeilBrown; +Cc: linux-nfs@vger.kernel.org, kinglongmee

On 5/5/2015 6:01 AM, J. Bruce Fields wrote:
> On Fri, May 01, 2015 at 11:53:26AM +1000, NeilBrown wrote:
>> On Thu, 30 Apr 2015 17:36:02 -0400 "J. Bruce Fields" <bfields@fieldses.org>
>> wrote:
>>
>>> On Thu, Apr 30, 2015 at 07:52:25AM +1000, NeilBrown wrote:
>>>> On Wed, 29 Apr 2015 15:19:34 -0400 "J. Bruce Fields" <bfields@fieldses.org>
>>>> wrote:
>>>>> Maybe drop the locking from nfsd_buffered_readdir and *just* take the
>>>>> i_mutex around lookup_one_len(), if that's the only place we need it?
>>>>
>>>> I think it is the only place needed.  Certainly normal path lookup
>>>> only takes i_mutex very briefly around __lookup_hash().
>>>>
>>>> One cost would be that we would take the mutex for every name instead
>>>> of once for a whole set of names.  That is generally frowned upon for
>>>> performance reasons.
>>>>
>>>> An alternative might be to stop using lookup_one_len, and use
>>>> kern_path() instead.  Or kern_path_mountpoint if we don't want to
>>>> cross a mountpoint.
>>>>
>>>> They would only take the i_mutex if absolutely necessary.
>>>>
>>>> The draw back is that they don't take a 'len' argument, so we would
>>>> need to make sure the name  is nul terminated (and maybe double-check
>>>> that there is no '/'??).  It would be fairly easy to nul-terminate
>>>> names from readdir - just use a bit more space  in the buffer in
>>>> nfsd_buffered_filldir().
>>>
>>> They're also a lot more complicated than lookup_one_len.  Is any of this
>>> extra stuff they do (audit?) going to bite us?
>>>
>>> For me understanding that would be harder than writing a variant of
>>> lookup_one_len that took the i_mutex when required.  But I guess that's
>>> my problem, I should try to understand the lookup code better....
>>
>> Tempting though ... see below (untested).
> 
> With documentation and a stab at the nfsd stuff (also untested.  OK, and
> pretty much unread.  Compiles, though!)
> 
> --b.
> 
> commit 6023d45abd1a
> Author: NeilBrown <neilb@suse.de>
> Date:   Fri May 1 11:53:26 2015 +1000
> 
>     nfsd: don't hold i_mutex over userspace upcalls
>     
>     We need information about exports when crossing mountpoints during
>     lookup or NFSv4 readdir.  If we don't already have that information
>     cached, we may have to ask (and wait for) rpc.mountd.
>     
>     In both cases we currently hold the i_mutex on the parent of the
>     directory we're asking rpc.mountd about.  We've seen situations where
>     rpc.mountd performs some operation on that directory that tries to take
>     the i_mutex again, resulting in deadlock.
>     
>     With some care, we may be able to avoid that in rpc.mountd.  But it
>     seems better just to avoid holding a mutex while waiting on userspace.
>     
>     It appears that lookup_one_len is pretty much the only operation that
>     needs the i_mutex.  So we could just drop the i_mutex elsewhere and do
>     something like
>     
>     	mutex_lock()
>     	lookup_one_len()
>     	mutex_unlock()
>     
>     In may cases though the lookup would have been cached and not required
>     the i_mutex, so it's more efficient to create a lookup_one_len() variant
>     that only takes the i_mutex when necessary.
>     
>     Signed-off-by: J. Bruce Fields <bfields@redhat.com>
> 
> diff --git a/fs/namei.c b/fs/namei.c
> index 4a8d998b7274..5ec103d4775d 100644
> --- a/fs/namei.c
> +++ b/fs/namei.c
> @@ -2139,6 +2139,8 @@ static struct dentry *lookup_hash(struct nameidata *nd)
>   *
>   * Note that this routine is purely a helper for filesystem usage and should
>   * not be called by generic code.
> + *
> + * Must be called with the parented i_mutex held.
>   */
>  struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
>  {
> @@ -2182,6 +2184,68 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
>  }
>  EXPORT_SYMBOL(lookup_one_len);
>  
> +/**
> + * lookup_one_len - filesystem helper to lookup single pathname component
> + * @name:	pathname component to lookup
> + * @base:	base directory to lookup from
> + * @len:	maximum length @len should be interpreted to
> + *
> + * Note that this routine is purely a helper for filesystem usage and should
> + * not be called by generic code.
> + *
> + * Unlike lookup_one_len, it should be called without the parent
> + * i_mutex held, and will take the i_mutex itself if necessary.
> + */
> +struct dentry *lookup_one_len_unlocked(const char *name,
> +				       struct dentry *base, int len)
> +{
> +	struct qstr this;
> +	unsigned int c;
> +	int err;
> +	struct dentry *ret;
> +
> +	WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));

Remove this line.

> +
> +	this.name = name;
> +	this.len = len;
> +	this.hash = full_name_hash(name, len);
> +	if (!len)
> +		return ERR_PTR(-EACCES);
> +
> +	if (unlikely(name[0] == '.')) {
> +		if (len < 2 || (len == 2 && name[1] == '.'))
> +			return ERR_PTR(-EACCES);
> +	}
> +
> +	while (len--) {
> +		c = *(const unsigned char *)name++;
> +		if (c == '/' || c == '\0')
> +			return ERR_PTR(-EACCES);
> +	}
> +	/*
> +	 * See if the low-level filesystem might want
> +	 * to use its own hash..
> +	 */
> +	if (base->d_flags & DCACHE_OP_HASH) {
> +		int err = base->d_op->d_hash(base, &this);
> +		if (err < 0)
> +			return ERR_PTR(err);
> +	}
> +
> +	err = inode_permission(base->d_inode, MAY_EXEC);
> +	if (err)
> +		return ERR_PTR(err);
> +
> +	ret = __d_lookup(base, &this);
> +	if (ret)
> +		return ret;
> +	mutex_lock(&base->d_inode->i_mutex);
> +	ret =  __lookup_hash(&this, base, 0);
> +	mutex_unlock(&base->d_inode->i_mutex);
> +	return ret;
> +}
> +EXPORT_SYMBOL(lookup_one_len_unlocked);
> +
>  int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
>  		 struct path *path, int *empty)
>  {
> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> index 158badf945df..2c1adaa0bd2f 100644
> --- a/fs/nfsd/nfs4xdr.c
> +++ b/fs/nfsd/nfs4xdr.c
> @@ -2804,14 +2804,14 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd,
>  	__be32 nfserr;
>  	int ignore_crossmnt = 0;
>  
> -	dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen);
> +	dentry = lookup_one_len_unlocked(name, cd->rd_fhp->fh_dentry, namlen);
>  	if (IS_ERR(dentry))
>  		return nfserrno(PTR_ERR(dentry));
>  	if (d_really_is_negative(dentry)) {
>  		/*
> -		 * nfsd_buffered_readdir drops the i_mutex between
> -		 * readdir and calling this callback, leaving a window
> -		 * where this directory entry could have gone away.
> +		 * we're not holding the i_mutex here, so there's
> +		 * a window where this directory entry could have gone
> +		 * away.
>  		 */
>  		dput(dentry);
>  		return nfserr_noent;
> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> index a30e79900086..cc7995762190 100644
> --- a/fs/nfsd/vfs.c
> +++ b/fs/nfsd/vfs.c
> @@ -217,6 +217,13 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
>  		host_err = PTR_ERR(dentry);
>  		if (IS_ERR(dentry))
>  			goto out_nfserr;
> +		if (!S_ISREG(d_inode(dentry)->i_mode)) {

Got a crash here tested by pynfs,
# ./testserver.py 127.0.0.1:/nfs/pnfs/ --rundeps --maketree open

PID: 2314   TASK: ffff88006ad2cfc0  CPU: 0   COMMAND: "nfsd"
 #0 [ffff88006adc7870] machine_kexec at ffffffff8104debb
 #1 [ffff88006adc78e0] crash_kexec at ffffffff810f92a2
 #2 [ffff88006adc79b0] oops_end at ffffffff81015c28
 #3 [ffff88006adc79e0] no_context at ffffffff8105a9af
 #4 [ffff88006adc7a50] __bad_area_nosemaphore at ffffffff8105ac90
 #5 [ffff88006adc7aa0] bad_area_nosemaphore at ffffffff8105ae23
 #6 [ffff88006adc7ab0] __do_page_fault at ffffffff8105b13e
 #7 [ffff88006adc7b10] do_page_fault at ffffffff8105b4df
 #8 [ffff88006adc7b50] page_fault at ffffffff81713258
    [exception RIP: nfsd_lookup_dentry+231]
    RIP: ffffffffa0447b87  RSP: ffff88006adc7c08  RFLAGS: 00010283
    RAX: ffff88006a9d2180  RBX: ffff88006ad80068  RCX: 0000000000000000
    RDX: 0000000000000000  RSI: ffff88007fdf4eb0  RDI: ffff88006a9d2180
    RBP: ffff88006adc7c88   R8: ffff88006b13dcc0   R9: 0000000000000000
    R10: ffff88006adb6cc0  R11: ffff88006adb6cc0  R12: ffff88006a9d0cc0
    R13: ffff88006adc7ca0  R14: ffff88006adc7ca8  R15: ffff88006ad640bc
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
 #9 [ffff88006adc7c90] nfsd_lookup at ffffffffa0448029 [nfsd]
#10 [ffff88006adc7cf0] nfsd4_open at ffffffffa0456cbb [nfsd]
#11 [ffff88006adc7d60] nfsd4_proc_compound at ffffffffa0457317 [nfsd]
#12 [ffff88006adc7dc0] nfsd_dispatch at ffffffffa0442f83 [nfsd]
#13 [ffff88006adc7e00] svc_process_common at ffffffffa0160c5b [sunrpc]
#14 [ffff88006adc7e70] svc_process at ffffffffa0161cc3 [sunrpc]
#15 [ffff88006adc7ea0] nfsd at ffffffffa044294f [nfsd]
#16 [ffff88006adc7ed0] kthread at ffffffff810a9d07
#17 [ffff88006adc7f50] ret_from_fork at ffffffff81711b62

thanks,
Kinglong Mee

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH RFC] NFSD: fix cannot umounting mount points under pseudo root
  2015-05-05 13:54         ` Kinglong Mee
@ 2015-05-05 14:18           ` J. Bruce Fields
  2015-05-05 15:52             ` J. Bruce Fields
  0 siblings, 1 reply; 7+ messages in thread
From: J. Bruce Fields @ 2015-05-05 14:18 UTC (permalink / raw)
  To: Kinglong Mee; +Cc: NeilBrown, linux-nfs@vger.kernel.org

On Tue, May 05, 2015 at 09:54:43PM +0800, Kinglong Mee wrote:
> On 5/5/2015 6:01 AM, J. Bruce Fields wrote:
> > + * Unlike lookup_one_len, it should be called without the parent
> > + * i_mutex held, and will take the i_mutex itself if necessary.
> > + */
> > +struct dentry *lookup_one_len_unlocked(const char *name,
> > +				       struct dentry *base, int len)
> > +{
> > +	struct qstr this;
> > +	unsigned int c;
> > +	int err;
> > +	struct dentry *ret;
> > +
> > +	WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));
> 
> Remove this line.

Whoops, thanks.

> > diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> > index a30e79900086..cc7995762190 100644
> > --- a/fs/nfsd/vfs.c
> > +++ b/fs/nfsd/vfs.c
> > @@ -217,6 +217,13 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
> >  		host_err = PTR_ERR(dentry);
> >  		if (IS_ERR(dentry))
> >  			goto out_nfserr;
> > +		if (!S_ISREG(d_inode(dentry)->i_mode)) {
> 
> Got a crash here tested by pynfs,

OK, I guess I just forgot to take into account negative dentries.
Testing that now with (!(d_inode(dentry) && S_ISREG(..))).

--b.

> # ./testserver.py 127.0.0.1:/nfs/pnfs/ --rundeps --maketree open
> 
> PID: 2314   TASK: ffff88006ad2cfc0  CPU: 0   COMMAND: "nfsd"
>  #0 [ffff88006adc7870] machine_kexec at ffffffff8104debb
>  #1 [ffff88006adc78e0] crash_kexec at ffffffff810f92a2
>  #2 [ffff88006adc79b0] oops_end at ffffffff81015c28
>  #3 [ffff88006adc79e0] no_context at ffffffff8105a9af
>  #4 [ffff88006adc7a50] __bad_area_nosemaphore at ffffffff8105ac90
>  #5 [ffff88006adc7aa0] bad_area_nosemaphore at ffffffff8105ae23
>  #6 [ffff88006adc7ab0] __do_page_fault at ffffffff8105b13e
>  #7 [ffff88006adc7b10] do_page_fault at ffffffff8105b4df
>  #8 [ffff88006adc7b50] page_fault at ffffffff81713258
>     [exception RIP: nfsd_lookup_dentry+231]
>     RIP: ffffffffa0447b87  RSP: ffff88006adc7c08  RFLAGS: 00010283
>     RAX: ffff88006a9d2180  RBX: ffff88006ad80068  RCX: 0000000000000000
>     RDX: 0000000000000000  RSI: ffff88007fdf4eb0  RDI: ffff88006a9d2180
>     RBP: ffff88006adc7c88   R8: ffff88006b13dcc0   R9: 0000000000000000
>     R10: ffff88006adb6cc0  R11: ffff88006adb6cc0  R12: ffff88006a9d0cc0
>     R13: ffff88006adc7ca0  R14: ffff88006adc7ca8  R15: ffff88006ad640bc
>     ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
>  #9 [ffff88006adc7c90] nfsd_lookup at ffffffffa0448029 [nfsd]
> #10 [ffff88006adc7cf0] nfsd4_open at ffffffffa0456cbb [nfsd]
> #11 [ffff88006adc7d60] nfsd4_proc_compound at ffffffffa0457317 [nfsd]
> #12 [ffff88006adc7dc0] nfsd_dispatch at ffffffffa0442f83 [nfsd]
> #13 [ffff88006adc7e00] svc_process_common at ffffffffa0160c5b [sunrpc]
> #14 [ffff88006adc7e70] svc_process at ffffffffa0161cc3 [sunrpc]
> #15 [ffff88006adc7ea0] nfsd at ffffffffa044294f [nfsd]
> #16 [ffff88006adc7ed0] kthread at ffffffff810a9d07
> #17 [ffff88006adc7f50] ret_from_fork at ffffffff81711b62
> 
> thanks,
> Kinglong Mee

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH RFC] NFSD: fix cannot umounting mount points under pseudo root
  2015-05-05 14:18           ` J. Bruce Fields
@ 2015-05-05 15:52             ` J. Bruce Fields
  2015-05-05 22:26               ` NeilBrown
  0 siblings, 1 reply; 7+ messages in thread
From: J. Bruce Fields @ 2015-05-05 15:52 UTC (permalink / raw)
  To: Kinglong Mee; +Cc: NeilBrown, linux-nfs@vger.kernel.org

On Tue, May 05, 2015 at 10:18:01AM -0400, J. Bruce Fields wrote:
> On Tue, May 05, 2015 at 09:54:43PM +0800, Kinglong Mee wrote:
> > On 5/5/2015 6:01 AM, J. Bruce Fields wrote:
> > > + * Unlike lookup_one_len, it should be called without the parent
> > > + * i_mutex held, and will take the i_mutex itself if necessary.
> > > + */
> > > +struct dentry *lookup_one_len_unlocked(const char *name,
> > > +				       struct dentry *base, int len)
> > > +{
> > > +	struct qstr this;
> > > +	unsigned int c;
> > > +	int err;
> > > +	struct dentry *ret;
> > > +
> > > +	WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));
> > 
> > Remove this line.
> 
> Whoops, thanks.
> 
> > > diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> > > index a30e79900086..cc7995762190 100644
> > > --- a/fs/nfsd/vfs.c
> > > +++ b/fs/nfsd/vfs.c
> > > @@ -217,6 +217,13 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > >  		host_err = PTR_ERR(dentry);
> > >  		if (IS_ERR(dentry))
> > >  			goto out_nfserr;
> > > +		if (!S_ISREG(d_inode(dentry)->i_mode)) {
> > 
> > Got a crash here tested by pynfs,
> 
> OK, I guess I just forgot to take into account negative dentries.
> Testing that now with (!(d_inode(dentry) && S_ISREG(..))).

And I also forgot nfs3xdr.c.

--b.

commit 6c942d342f9f
Author: NeilBrown <neilb@suse.de>
Date:   Fri May 1 11:53:26 2015 +1000

    nfsd: don't hold i_mutex over userspace upcalls
    
    We need information about exports when crossing mountpoints during
    lookup or NFSv4 readdir.  If we don't already have that information
    cached, we may have to ask (and wait for) rpc.mountd.
    
    In both cases we currently hold the i_mutex on the parent of the
    directory we're asking rpc.mountd about.  We've seen situations where
    rpc.mountd performs some operation on that directory that tries to take
    the i_mutex again, resulting in deadlock.
    
    With some care, we may be able to avoid that in rpc.mountd.  But it
    seems better just to avoid holding a mutex while waiting on userspace.
    
    It appears that lookup_one_len is pretty much the only operation that
    needs the i_mutex.  So we could just drop the i_mutex elsewhere and do
    something like
    
    	mutex_lock()
    	lookup_one_len()
    	mutex_unlock()
    
    In many cases though the lookup would have been cached and not required
    the i_mutex, so it's more efficient to create a lookup_one_len() variant
    that only takes the i_mutex when necessary.
    
    Signed-off-by: J. Bruce Fields <bfields@redhat.com>

diff --git a/fs/namei.c b/fs/namei.c
index 4a8d998b7274..0f554bf41dea 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2139,6 +2139,8 @@ static struct dentry *lookup_hash(struct nameidata *nd)
  *
  * Note that this routine is purely a helper for filesystem usage and should
  * not be called by generic code.
+ *
+ * Must be called with the parented i_mutex held.
  */
 struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 {
@@ -2182,6 +2184,66 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 }
 EXPORT_SYMBOL(lookup_one_len);
 
+/**
+ * lookup_one_len - filesystem helper to lookup single pathname component
+ * @name:	pathname component to lookup
+ * @base:	base directory to lookup from
+ * @len:	maximum length @len should be interpreted to
+ *
+ * Note that this routine is purely a helper for filesystem usage and should
+ * not be called by generic code.
+ *
+ * Unlike lookup_one_len, it should be called without the parent
+ * i_mutex held, and will take the i_mutex itself if necessary.
+ */
+struct dentry *lookup_one_len_unlocked(const char *name,
+				       struct dentry *base, int len)
+{
+	struct qstr this;
+	unsigned int c;
+	int err;
+	struct dentry *ret;
+
+	this.name = name;
+	this.len = len;
+	this.hash = full_name_hash(name, len);
+	if (!len)
+		return ERR_PTR(-EACCES);
+
+	if (unlikely(name[0] == '.')) {
+		if (len < 2 || (len == 2 && name[1] == '.'))
+			return ERR_PTR(-EACCES);
+	}
+
+	while (len--) {
+		c = *(const unsigned char *)name++;
+		if (c == '/' || c == '\0')
+			return ERR_PTR(-EACCES);
+	}
+	/*
+	 * See if the low-level filesystem might want
+	 * to use its own hash..
+	 */
+	if (base->d_flags & DCACHE_OP_HASH) {
+		int err = base->d_op->d_hash(base, &this);
+		if (err < 0)
+			return ERR_PTR(err);
+	}
+
+	err = inode_permission(base->d_inode, MAY_EXEC);
+	if (err)
+		return ERR_PTR(err);
+
+	ret = __d_lookup(base, &this);
+	if (ret)
+		return ret;
+	mutex_lock(&base->d_inode->i_mutex);
+	ret =  __lookup_hash(&this, base, 0);
+	mutex_unlock(&base->d_inode->i_mutex);
+	return ret;
+}
+EXPORT_SYMBOL(lookup_one_len_unlocked);
+
 int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
 		 struct path *path, int *empty)
 {
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index f6e7cbabac5a..01dcd494f781 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -823,7 +823,7 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
 		} else
 			dchild = dget(dparent);
 	} else
-		dchild = lookup_one_len(name, dparent, namlen);
+		dchild = lookup_one_len_unlocked(name, dparent, namlen);
 	if (IS_ERR(dchild))
 		return rv;
 	if (d_mountpoint(dchild))
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 158badf945df..2c1adaa0bd2f 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2804,14 +2804,14 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd,
 	__be32 nfserr;
 	int ignore_crossmnt = 0;
 
-	dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen);
+	dentry = lookup_one_len_unlocked(name, cd->rd_fhp->fh_dentry, namlen);
 	if (IS_ERR(dentry))
 		return nfserrno(PTR_ERR(dentry));
 	if (d_really_is_negative(dentry)) {
 		/*
-		 * nfsd_buffered_readdir drops the i_mutex between
-		 * readdir and calling this callback, leaving a window
-		 * where this directory entry could have gone away.
+		 * we're not holding the i_mutex here, so there's
+		 * a window where this directory entry could have gone
+		 * away.
 		 */
 		dput(dentry);
 		return nfserr_noent;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index a30e79900086..4accdb00976b 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -217,6 +217,13 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		host_err = PTR_ERR(dentry);
 		if (IS_ERR(dentry))
 			goto out_nfserr;
+		if (!(d_inode(dentry) && S_ISREG(d_inode(dentry)->i_mode))) {
+			/*
+			 * Never mind, we're not going to open this
+			 * anyway.  And we don't want to hold it over
+			 * the userspace upcalls in nfsd_mountpoint. */
+			fh_unlock(fhp);
+		}
 		/*
 		 * check if we have crossed a mount point ...
 		 */
@@ -1876,7 +1883,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
 	offset = *offsetp;
 
 	while (1) {
-		struct inode *dir_inode = file_inode(file);
 		unsigned int reclen;
 
 		cdp->err = nfserr_eof; /* will be cleared on successful read */
@@ -1895,15 +1901,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
 		if (!size)
 			break;
 
-		/*
-		 * Various filldir functions may end up calling back into
-		 * lookup_one_len() and the file system's ->lookup() method.
-		 * These expect i_mutex to be held, as it would within readdir.
-		 */
-		host_err = mutex_lock_killable(&dir_inode->i_mutex);
-		if (host_err)
-			break;
-
 		de = (struct buffered_dirent *)buf.dirent;
 		while (size > 0) {
 			offset = de->offset;
@@ -1920,7 +1917,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
 			size -= reclen;
 			de = (struct buffered_dirent *)((char *)de + reclen);
 		}
-		mutex_unlock(&dir_inode->i_mutex);
 		if (size > 0) /* We bailed out early */
 			break;
 
diff --git a/include/linux/namei.h b/include/linux/namei.h
index c8990779f0c3..bb3a2f7cca67 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -62,6 +62,7 @@ extern struct dentry *kern_path_locked(const char *, struct path *);
 extern int kern_path_mountpoint(int, const char *, struct path *, unsigned int);
 
 extern struct dentry *lookup_one_len(const char *, struct dentry *, int);
+extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int);
 
 extern int follow_down_one(struct path *);
 extern int follow_down(struct path *);

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH RFC] NFSD: fix cannot umounting mount points under pseudo root
  2015-05-05 15:52             ` J. Bruce Fields
@ 2015-05-05 22:26               ` NeilBrown
  2015-05-08 16:15                 ` J. Bruce Fields
  0 siblings, 1 reply; 7+ messages in thread
From: NeilBrown @ 2015-05-05 22:26 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: Kinglong Mee, linux-nfs@vger.kernel.org

[-- Attachment #1: Type: text/plain, Size: 10922 bytes --]

On Tue, 5 May 2015 11:52:03 -0400 "J. Bruce Fields" <bfields@fieldses.org>
wrote:

> On Tue, May 05, 2015 at 10:18:01AM -0400, J. Bruce Fields wrote:
> > On Tue, May 05, 2015 at 09:54:43PM +0800, Kinglong Mee wrote:
> > > On 5/5/2015 6:01 AM, J. Bruce Fields wrote:
> > > > + * Unlike lookup_one_len, it should be called without the parent
> > > > + * i_mutex held, and will take the i_mutex itself if necessary.
> > > > + */
> > > > +struct dentry *lookup_one_len_unlocked(const char *name,
> > > > +				       struct dentry *base, int len)
> > > > +{
> > > > +	struct qstr this;
> > > > +	unsigned int c;
> > > > +	int err;
> > > > +	struct dentry *ret;
> > > > +
> > > > +	WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));
> > > 
> > > Remove this line.
> > 
> > Whoops, thanks.
> > 
> > > > diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> > > > index a30e79900086..cc7995762190 100644
> > > > --- a/fs/nfsd/vfs.c
> > > > +++ b/fs/nfsd/vfs.c
> > > > @@ -217,6 +217,13 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > > >  		host_err = PTR_ERR(dentry);
> > > >  		if (IS_ERR(dentry))
> > > >  			goto out_nfserr;
> > > > +		if (!S_ISREG(d_inode(dentry)->i_mode)) {
> > > 
> > > Got a crash here tested by pynfs,
> > 
> > OK, I guess I just forgot to take into account negative dentries.
> > Testing that now with (!(d_inode(dentry) && S_ISREG(..))).
> 
> And I also forgot nfs3xdr.c.
> 
> --b.
> 
> commit 6c942d342f9f
> Author: NeilBrown <neilb@suse.de>
> Date:   Fri May 1 11:53:26 2015 +1000
> 
>     nfsd: don't hold i_mutex over userspace upcalls
>     
>     We need information about exports when crossing mountpoints during
>     lookup or NFSv4 readdir.  If we don't already have that information
>     cached, we may have to ask (and wait for) rpc.mountd.
>     
>     In both cases we currently hold the i_mutex on the parent of the
>     directory we're asking rpc.mountd about.  We've seen situations where
>     rpc.mountd performs some operation on that directory that tries to take
>     the i_mutex again, resulting in deadlock.
>     
>     With some care, we may be able to avoid that in rpc.mountd.  But it
>     seems better just to avoid holding a mutex while waiting on userspace.
>     
>     It appears that lookup_one_len is pretty much the only operation that
>     needs the i_mutex.  So we could just drop the i_mutex elsewhere and do
>     something like
>     
>     	mutex_lock()
>     	lookup_one_len()
>     	mutex_unlock()
>     
>     In many cases though the lookup would have been cached and not required
>     the i_mutex, so it's more efficient to create a lookup_one_len() variant
>     that only takes the i_mutex when necessary.
>     
>     Signed-off-by: J. Bruce Fields <bfields@redhat.com>
> 
> diff --git a/fs/namei.c b/fs/namei.c
> index 4a8d998b7274..0f554bf41dea 100644
> --- a/fs/namei.c
> +++ b/fs/namei.c
> @@ -2139,6 +2139,8 @@ static struct dentry *lookup_hash(struct nameidata *nd)
>   *
>   * Note that this routine is purely a helper for filesystem usage and should
>   * not be called by generic code.
> + *
> + * Must be called with the parented i_mutex held.

"parented"?

    * base->i_mutex must be held by caller.

??


>   */
>  struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
>  {
> @@ -2182,6 +2184,66 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
>  }
>  EXPORT_SYMBOL(lookup_one_len);
>  
> +/**
> + * lookup_one_len - filesystem helper to lookup single pathname component
      lookup_one_len_unlocked - ..



> + * @name:	pathname component to lookup
> + * @base:	base directory to lookup from
> + * @len:	maximum length @len should be interpreted to
> + *
> + * Note that this routine is purely a helper for filesystem usage and should
> + * not be called by generic code.
> + *
> + * Unlike lookup_one_len, it should be called without the parent
> + * i_mutex held, and will take the i_mutex itself if necessary.
> + */
> +struct dentry *lookup_one_len_unlocked(const char *name,
> +				       struct dentry *base, int len)
> +{
> +	struct qstr this;
> +	unsigned int c;
> +	int err;
> +	struct dentry *ret;
> +
> +	this.name = name;
> +	this.len = len;
> +	this.hash = full_name_hash(name, len);
> +	if (!len)
> +		return ERR_PTR(-EACCES);
> +
> +	if (unlikely(name[0] == '.')) {
> +		if (len < 2 || (len == 2 && name[1] == '.'))
> +			return ERR_PTR(-EACCES);
> +	}
> +
> +	while (len--) {
> +		c = *(const unsigned char *)name++;
> +		if (c == '/' || c == '\0')
> +			return ERR_PTR(-EACCES);
> +	}
> +	/*
> +	 * See if the low-level filesystem might want
> +	 * to use its own hash..
> +	 */
> +	if (base->d_flags & DCACHE_OP_HASH) {
> +		int err = base->d_op->d_hash(base, &this);
> +		if (err < 0)
> +			return ERR_PTR(err);
> +	}
> +
> +	err = inode_permission(base->d_inode, MAY_EXEC);
> +	if (err)
> +		return ERR_PTR(err);
> +
> +	ret = __d_lookup(base, &this);
> +	if (ret)
> +		return ret;

__d_lookup() is a little too low level, as it doesn't call d_revalidate() and
it doesn't retry if the rename_lock seqlock says it should.

The latter doesn't really matter as we would fall through to the safe
mutex-protected version.
The former isn't much of an issue for most filesystems under nfsd, but still
needs to be handled to some extent.
Also, the comment for __d_lookup says

 * __d_lookup callers must be commented.

The simplest resolution would be:

	/* __d_lookup() is used to try to get a quick answer and avoid the
	 * mutex.  A false-negative does no harm.
	 */
	ret = __d_lookup(base, &this);
	if (ret && ret->d_flags & DCACHE_OP_REVALIDATE) {
		dput(ret);
		ret = NULL;
	}
	if (ret)
		return ret;


It probably wouldn't be too much harder to add the d_revalidate() call, and
call d_invalidate() if it returns zero.  

Maybe.


	if (ret && ret->d_flags & DCACHE_OP_REVALIDATE) {
		int err = d_revalidate(ret, 0);
		if (err == 0) {
			d_invalidate(ret);
			dput(ret);
			ret = NULL;
		} else if (err < 0) {
			dput(ret);
			return ERR_PTR(err);
		}
	}


> +	mutex_lock(&base->d_inode->i_mutex);
> +	ret =  __lookup_hash(&this, base, 0);
> +	mutex_unlock(&base->d_inode->i_mutex);
> +	return ret;
> +}
> +EXPORT_SYMBOL(lookup_one_len_unlocked);
> +
>  int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
>  		 struct path *path, int *empty)
>  {
> diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
> index f6e7cbabac5a..01dcd494f781 100644
> --- a/fs/nfsd/nfs3xdr.c
> +++ b/fs/nfsd/nfs3xdr.c
> @@ -823,7 +823,7 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
>  		} else
>  			dchild = dget(dparent);
>  	} else
> -		dchild = lookup_one_len(name, dparent, namlen);
> +		dchild = lookup_one_len_unlocked(name, dparent, namlen);
>  	if (IS_ERR(dchild))
>  		return rv;
>  	if (d_mountpoint(dchild))
> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> index 158badf945df..2c1adaa0bd2f 100644
> --- a/fs/nfsd/nfs4xdr.c
> +++ b/fs/nfsd/nfs4xdr.c
> @@ -2804,14 +2804,14 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd,
>  	__be32 nfserr;
>  	int ignore_crossmnt = 0;
>  
> -	dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen);
> +	dentry = lookup_one_len_unlocked(name, cd->rd_fhp->fh_dentry, namlen);
>  	if (IS_ERR(dentry))
>  		return nfserrno(PTR_ERR(dentry));
>  	if (d_really_is_negative(dentry)) {
>  		/*
> -		 * nfsd_buffered_readdir drops the i_mutex between
> -		 * readdir and calling this callback, leaving a window
> -		 * where this directory entry could have gone away.
> +		 * we're not holding the i_mutex here, so there's
> +		 * a window where this directory entry could have gone
> +		 * away.
>  		 */
>  		dput(dentry);
>  		return nfserr_noent;
> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> index a30e79900086..4accdb00976b 100644
> --- a/fs/nfsd/vfs.c
> +++ b/fs/nfsd/vfs.c
> @@ -217,6 +217,13 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
>  		host_err = PTR_ERR(dentry);
>  		if (IS_ERR(dentry))
>  			goto out_nfserr;
> +		if (!(d_inode(dentry) && S_ISREG(d_inode(dentry)->i_mode))) {
> +			/*
> +			 * Never mind, we're not going to open this
> +			 * anyway.  And we don't want to hold it over
> +			 * the userspace upcalls in nfsd_mountpoint. */
> +			fh_unlock(fhp);
> +		}

You could avoid the test by just putting the fh_unlock(fhp) inside the 

		if (nfsd_mountpoint(dentry, exp)) {

block.  It might also be appropriate for nfsd_mountpoint to fail on
non-directories.

Up to you though.

Thanks.  Feel free to add
  *-by: NeilBrown <neilb@suse.de>

as you see fit.

NeilBrown


>  		/*
>  		 * check if we have crossed a mount point ...
>  		 */
> @@ -1876,7 +1883,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
>  	offset = *offsetp;
>  
>  	while (1) {
> -		struct inode *dir_inode = file_inode(file);
>  		unsigned int reclen;
>  
>  		cdp->err = nfserr_eof; /* will be cleared on successful read */
> @@ -1895,15 +1901,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
>  		if (!size)
>  			break;
>  
> -		/*
> -		 * Various filldir functions may end up calling back into
> -		 * lookup_one_len() and the file system's ->lookup() method.
> -		 * These expect i_mutex to be held, as it would within readdir.
> -		 */
> -		host_err = mutex_lock_killable(&dir_inode->i_mutex);
> -		if (host_err)
> -			break;
> -
>  		de = (struct buffered_dirent *)buf.dirent;
>  		while (size > 0) {
>  			offset = de->offset;
> @@ -1920,7 +1917,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
>  			size -= reclen;
>  			de = (struct buffered_dirent *)((char *)de + reclen);
>  		}
> -		mutex_unlock(&dir_inode->i_mutex);
>  		if (size > 0) /* We bailed out early */
>  			break;
>  
> diff --git a/include/linux/namei.h b/include/linux/namei.h
> index c8990779f0c3..bb3a2f7cca67 100644
> --- a/include/linux/namei.h
> +++ b/include/linux/namei.h
> @@ -62,6 +62,7 @@ extern struct dentry *kern_path_locked(const char *, struct path *);
>  extern int kern_path_mountpoint(int, const char *, struct path *, unsigned int);
>  
>  extern struct dentry *lookup_one_len(const char *, struct dentry *, int);
> +extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int);
>  
>  extern int follow_down_one(struct path *);
>  extern int follow_down(struct path *);
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 811 bytes --]

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH RFC] NFSD: fix cannot umounting mount points under pseudo root
  2015-05-05 22:26               ` NeilBrown
@ 2015-05-08 16:15                 ` J. Bruce Fields
  2015-05-08 20:01                   ` [PATCH] nfsd: don't hold i_mutex over userspace upcalls J. Bruce Fields
  0 siblings, 1 reply; 7+ messages in thread
From: J. Bruce Fields @ 2015-05-08 16:15 UTC (permalink / raw)
  To: NeilBrown; +Cc: Kinglong Mee, linux-nfs@vger.kernel.org

On Wed, May 06, 2015 at 08:26:28AM +1000, NeilBrown wrote:
> On Tue, 5 May 2015 11:52:03 -0400 "J. Bruce Fields" <bfields@fieldses.org>
> > diff --git a/fs/namei.c b/fs/namei.c
> > index 4a8d998b7274..0f554bf41dea 100644
> > --- a/fs/namei.c
> > +++ b/fs/namei.c
> > @@ -2139,6 +2139,8 @@ static struct dentry *lookup_hash(struct nameidata *nd)
> >   *
> >   * Note that this routine is purely a helper for filesystem usage and should
> >   * not be called by generic code.
> > + *
> > + * Must be called with the parented i_mutex held.
> 
> "parented"?
> 
>     * base->i_mutex must be held by caller.
> 
> ??

Thanks.

> > +struct dentry *lookup_one_len_unlocked(const char *name,
...
> > +	ret = __d_lookup(base, &this);
> > +	if (ret)
> > +		return ret;
> 
> __d_lookup() is a little too low level, as it doesn't call d_revalidate() and
> it doesn't retry if the rename_lock seqlock says it should.
> 
> The latter doesn't really matter as we would fall through to the safe
> mutex-protected version.
> The former isn't much of an issue for most filesystems under nfsd, but still
> needs to be handled to some extent.
> Also, the comment for __d_lookup says
> 
>  * __d_lookup callers must be commented.
> 
> The simplest resolution would be:
> 
> 	/* __d_lookup() is used to try to get a quick answer and avoid the
> 	 * mutex.  A false-negative does no harm.
> 	 */
> 	ret = __d_lookup(base, &this);
> 	if (ret && ret->d_flags & DCACHE_OP_REVALIDATE) {
> 		dput(ret);
> 		ret = NULL;
> 	}
> 	if (ret)
> 		return ret;

That looks right to me....  I'll probably take the simple option.

> > diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> > index a30e79900086..4accdb00976b 100644
> > --- a/fs/nfsd/vfs.c
> > +++ b/fs/nfsd/vfs.c
> > @@ -217,6 +217,13 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
> >  		host_err = PTR_ERR(dentry);
> >  		if (IS_ERR(dentry))
> >  			goto out_nfserr;
> > +		if (!(d_inode(dentry) && S_ISREG(d_inode(dentry)->i_mode))) {
> > +			/*
> > +			 * Never mind, we're not going to open this
> > +			 * anyway.  And we don't want to hold it over
> > +			 * the userspace upcalls in nfsd_mountpoint. */
> > +			fh_unlock(fhp);
> > +		}
> 
> You could avoid the test by just putting the fh_unlock(fhp) inside the 
> 
> 		if (nfsd_mountpoint(dentry, exp)) {
> 
> block.  It might also be appropriate for nfsd_mountpoint to fail on
> non-directories.

If check_export()'s to be believed, our support for regular-file
exports is intentional. So even if nfsd_mountpoint() is true, it's
possible we might still be about to open the result.

But I think we're OK:
 
The only reason for keeping the i_mutex here in the open case is to
avoid the situation where a client does OPEN(dirfh, "foo") and gets a
reply that grants a delegation even though the file has actually been
renamed to "bar".  (Thus violating the protocol, since the delegation's
supposed to guarantee you'll be notified before the file's renamed.)

(So the i_mutex is actually overkill, it would be enough to detect a
rename and then refuse the delegation (which we're always allowed to
do).)

But actually, if this is a mountpoint then I suppose we're safe from a
rename.  So, right, we can just move that unlock into the mountpoint
case.

--b.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH] nfsd: don't hold i_mutex over userspace upcalls
  2015-05-08 16:15                 ` J. Bruce Fields
@ 2015-05-08 20:01                   ` J. Bruce Fields
  2015-06-03 15:18                     ` J. Bruce Fields
  0 siblings, 1 reply; 7+ messages in thread
From: J. Bruce Fields @ 2015-05-08 20:01 UTC (permalink / raw)
  To: NeilBrown; +Cc: Kinglong Mee, linux-nfs@vger.kernel.org

From: NeilBrown <neilb@suse.de>

We need information about exports when crossing mountpoints during
lookup or NFSv4 readdir.  If we don't already have that information
cached, we may have to ask (and wait for) rpc.mountd.

In both cases we currently hold the i_mutex on the parent of the
directory we're asking rpc.mountd about.  We've seen situations where
rpc.mountd performs some operation on that directory that tries to take
the i_mutex again, resulting in deadlock.

With some care, we may be able to avoid that in rpc.mountd.  But it
seems better just to avoid holding a mutex while waiting on userspace.

It appears that lookup_one_len is pretty much the only operation that
needs the i_mutex.  So we could just drop the i_mutex elsewhere and do
something like

	mutex_lock()
	lookup_one_len()
	mutex_unlock()

In many cases though the lookup would have been cached and not required
the i_mutex, so it's more efficient to create a lookup_one_len() variant
that only takes the i_mutex when necessary.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/namei.c            | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/nfsd/nfs3xdr.c     |  2 +-
 fs/nfsd/nfs4xdr.c     |  8 +++---
 fs/nfsd/vfs.c         | 23 +++++++---------
 include/linux/namei.h |  1 +
 5 files changed, 89 insertions(+), 19 deletions(-)

Here's an updated patch.

diff --git a/fs/namei.c b/fs/namei.c
index 4a8d998b7274..8b866d79c5b7 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2139,6 +2139,8 @@ static struct dentry *lookup_hash(struct nameidata *nd)
  *
  * Note that this routine is purely a helper for filesystem usage and should
  * not be called by generic code.
+ *
+ * The caller must hold base->i_mutex.
  */
 struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 {
@@ -2182,6 +2184,78 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 }
 EXPORT_SYMBOL(lookup_one_len);
 
+/**
+ * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
+ * @name:	pathname component to lookup
+ * @base:	base directory to lookup from
+ * @len:	maximum length @len should be interpreted to
+ *
+ * Note that this routine is purely a helper for filesystem usage and should
+ * not be called by generic code.
+ *
+ * Unlike lookup_one_len, it should be called without the parent
+ * i_mutex held, and will take the i_mutex itself if necessary.
+ */
+struct dentry *lookup_one_len_unlocked(const char *name,
+				       struct dentry *base, int len)
+{
+	struct qstr this;
+	unsigned int c;
+	int err;
+	struct dentry *ret;
+
+	this.name = name;
+	this.len = len;
+	this.hash = full_name_hash(name, len);
+	if (!len)
+		return ERR_PTR(-EACCES);
+
+	if (unlikely(name[0] == '.')) {
+		if (len < 2 || (len == 2 && name[1] == '.'))
+			return ERR_PTR(-EACCES);
+	}
+
+	while (len--) {
+		c = *(const unsigned char *)name++;
+		if (c == '/' || c == '\0')
+			return ERR_PTR(-EACCES);
+	}
+	/*
+	 * See if the low-level filesystem might want
+	 * to use its own hash..
+	 */
+	if (base->d_flags & DCACHE_OP_HASH) {
+		int err = base->d_op->d_hash(base, &this);
+		if (err < 0)
+			return ERR_PTR(err);
+	}
+
+	err = inode_permission(base->d_inode, MAY_EXEC);
+	if (err)
+		return ERR_PTR(err);
+
+	ret = __d_lookup(base, &this);
+	if (ret)
+		return ret;
+	/*
+	 * __d_lookup() is used to try to get a quick answer and avoid the
+	 * mutex.  A false-negative does no harm.
+	 */
+	ret = __d_lookup(base, &this);
+	if (ret && ret->d_flags & DCACHE_OP_REVALIDATE) {
+		dput(ret);
+		ret = NULL;
+	}
+	if (ret)
+		return ret;
+
+	mutex_lock(&base->d_inode->i_mutex);
+	ret =  __lookup_hash(&this, base, 0);
+	mutex_unlock(&base->d_inode->i_mutex);
+	return ret;
+}
+EXPORT_SYMBOL(lookup_one_len_unlocked);
+
 int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
 		 struct path *path, int *empty)
 {
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index f6e7cbabac5a..01dcd494f781 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -823,7 +823,7 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
 		} else
 			dchild = dget(dparent);
 	} else
-		dchild = lookup_one_len(name, dparent, namlen);
+		dchild = lookup_one_len_unlocked(name, dparent, namlen);
 	if (IS_ERR(dchild))
 		return rv;
 	if (d_mountpoint(dchild))
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 158badf945df..2c1adaa0bd2f 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2804,14 +2804,14 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd,
 	__be32 nfserr;
 	int ignore_crossmnt = 0;
 
-	dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen);
+	dentry = lookup_one_len_unlocked(name, cd->rd_fhp->fh_dentry, namlen);
 	if (IS_ERR(dentry))
 		return nfserrno(PTR_ERR(dentry));
 	if (d_really_is_negative(dentry)) {
 		/*
-		 * nfsd_buffered_readdir drops the i_mutex between
-		 * readdir and calling this callback, leaving a window
-		 * where this directory entry could have gone away.
+		 * we're not holding the i_mutex here, so there's
+		 * a window where this directory entry could have gone
+		 * away.
 		 */
 		dput(dentry);
 		return nfserr_noent;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index a30e79900086..6d5b33458e91 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -217,10 +217,16 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		host_err = PTR_ERR(dentry);
 		if (IS_ERR(dentry))
 			goto out_nfserr;
-		/*
-		 * check if we have crossed a mount point ...
-		 */
 		if (nfsd_mountpoint(dentry, exp)) {
+			/*
+			 * We don't need the i_mutex after all.  It's
+			 * still possible we could open this (regular
+			 * files can be mountpoints too), but the
+			 * i_mutex is just there to prevent renames of
+			 * something that we might be about to delegate,
+			 * and a mountpoint won't be renamed:
+			 */
+			fh_unlock(fhp);
 			if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) {
 				dput(dentry);
 				goto out_nfserr;
@@ -1876,7 +1882,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
 	offset = *offsetp;
 
 	while (1) {
-		struct inode *dir_inode = file_inode(file);
 		unsigned int reclen;
 
 		cdp->err = nfserr_eof; /* will be cleared on successful read */
@@ -1895,15 +1900,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
 		if (!size)
 			break;
 
-		/*
-		 * Various filldir functions may end up calling back into
-		 * lookup_one_len() and the file system's ->lookup() method.
-		 * These expect i_mutex to be held, as it would within readdir.
-		 */
-		host_err = mutex_lock_killable(&dir_inode->i_mutex);
-		if (host_err)
-			break;
-
 		de = (struct buffered_dirent *)buf.dirent;
 		while (size > 0) {
 			offset = de->offset;
@@ -1920,7 +1916,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
 			size -= reclen;
 			de = (struct buffered_dirent *)((char *)de + reclen);
 		}
-		mutex_unlock(&dir_inode->i_mutex);
 		if (size > 0) /* We bailed out early */
 			break;
 
diff --git a/include/linux/namei.h b/include/linux/namei.h
index c8990779f0c3..bb3a2f7cca67 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -62,6 +62,7 @@ extern struct dentry *kern_path_locked(const char *, struct path *);
 extern int kern_path_mountpoint(int, const char *, struct path *, unsigned int);
 
 extern struct dentry *lookup_one_len(const char *, struct dentry *, int);
+extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int);
 
 extern int follow_down_one(struct path *);
 extern int follow_down(struct path *);
-- 
1.9.3


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH] nfsd: don't hold i_mutex over userspace upcalls
  2015-05-08 20:01                   ` [PATCH] nfsd: don't hold i_mutex over userspace upcalls J. Bruce Fields
@ 2015-06-03 15:18                     ` J. Bruce Fields
  2015-07-05 11:27                       ` Kinglong Mee
  2015-08-18 19:10                       ` J. Bruce Fields
  0 siblings, 2 replies; 7+ messages in thread
From: J. Bruce Fields @ 2015-06-03 15:18 UTC (permalink / raw)
  To: NeilBrown; +Cc: Kinglong Mee, linux-nfs@vger.kernel.org, Al Viro, linux-fsdevel

This passes my review, but it needs an ACK from Al or someone for the
addition of the new lookup_one_len_unlocked (which is the same as
lookup_one_len except that it takes the i_mutex itself when required
instead of requiring the caller to).

--b.

On Fri, May 08, 2015 at 04:01:33PM -0400, J. Bruce Fields wrote:
> From: NeilBrown <neilb@suse.de>
> 
> We need information about exports when crossing mountpoints during
> lookup or NFSv4 readdir.  If we don't already have that information
> cached, we may have to ask (and wait for) rpc.mountd.
> 
> In both cases we currently hold the i_mutex on the parent of the
> directory we're asking rpc.mountd about.  We've seen situations where
> rpc.mountd performs some operation on that directory that tries to take
> the i_mutex again, resulting in deadlock.
> 
> With some care, we may be able to avoid that in rpc.mountd.  But it
> seems better just to avoid holding a mutex while waiting on userspace.
> 
> It appears that lookup_one_len is pretty much the only operation that
> needs the i_mutex.  So we could just drop the i_mutex elsewhere and do
> something like
> 
> 	mutex_lock()
> 	lookup_one_len()
> 	mutex_unlock()
> 
> In many cases though the lookup would have been cached and not required
> the i_mutex, so it's more efficient to create a lookup_one_len() variant
> that only takes the i_mutex when necessary.
> 
> Signed-off-by: J. Bruce Fields <bfields@redhat.com>
> ---
>  fs/namei.c            | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  fs/nfsd/nfs3xdr.c     |  2 +-
>  fs/nfsd/nfs4xdr.c     |  8 +++---
>  fs/nfsd/vfs.c         | 23 +++++++---------
>  include/linux/namei.h |  1 +
>  5 files changed, 89 insertions(+), 19 deletions(-)
> 
> Here's an updated patch.
> 
> diff --git a/fs/namei.c b/fs/namei.c
> index 4a8d998b7274..8b866d79c5b7 100644
> --- a/fs/namei.c
> +++ b/fs/namei.c
> @@ -2139,6 +2139,8 @@ static struct dentry *lookup_hash(struct nameidata *nd)
>   *
>   * Note that this routine is purely a helper for filesystem usage and should
>   * not be called by generic code.
> + *
> + * The caller must hold base->i_mutex.
>   */
>  struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
>  {
> @@ -2182,6 +2184,78 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
>  }
>  EXPORT_SYMBOL(lookup_one_len);
>  
> +/**
> + * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
> + * @name:	pathname component to lookup
> + * @base:	base directory to lookup from
> + * @len:	maximum length @len should be interpreted to
> + *
> + * Note that this routine is purely a helper for filesystem usage and should
> + * not be called by generic code.
> + *
> + * Unlike lookup_one_len, it should be called without the parent
> + * i_mutex held, and will take the i_mutex itself if necessary.
> + */
> +struct dentry *lookup_one_len_unlocked(const char *name,
> +				       struct dentry *base, int len)
> +{
> +	struct qstr this;
> +	unsigned int c;
> +	int err;
> +	struct dentry *ret;
> +
> +	this.name = name;
> +	this.len = len;
> +	this.hash = full_name_hash(name, len);
> +	if (!len)
> +		return ERR_PTR(-EACCES);
> +
> +	if (unlikely(name[0] == '.')) {
> +		if (len < 2 || (len == 2 && name[1] == '.'))
> +			return ERR_PTR(-EACCES);
> +	}
> +
> +	while (len--) {
> +		c = *(const unsigned char *)name++;
> +		if (c == '/' || c == '\0')
> +			return ERR_PTR(-EACCES);
> +	}
> +	/*
> +	 * See if the low-level filesystem might want
> +	 * to use its own hash..
> +	 */
> +	if (base->d_flags & DCACHE_OP_HASH) {
> +		int err = base->d_op->d_hash(base, &this);
> +		if (err < 0)
> +			return ERR_PTR(err);
> +	}
> +
> +	err = inode_permission(base->d_inode, MAY_EXEC);
> +	if (err)
> +		return ERR_PTR(err);
> +
> +	ret = __d_lookup(base, &this);
> +	if (ret)
> +		return ret;
> +	/*
> +	 * __d_lookup() is used to try to get a quick answer and avoid the
> +	 * mutex.  A false-negative does no harm.
> +	 */
> +	ret = __d_lookup(base, &this);
> +	if (ret && ret->d_flags & DCACHE_OP_REVALIDATE) {
> +		dput(ret);
> +		ret = NULL;
> +	}
> +	if (ret)
> +		return ret;
> +
> +	mutex_lock(&base->d_inode->i_mutex);
> +	ret =  __lookup_hash(&this, base, 0);
> +	mutex_unlock(&base->d_inode->i_mutex);
> +	return ret;
> +}
> +EXPORT_SYMBOL(lookup_one_len_unlocked);
> +
>  int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
>  		 struct path *path, int *empty)
>  {
> diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
> index f6e7cbabac5a..01dcd494f781 100644
> --- a/fs/nfsd/nfs3xdr.c
> +++ b/fs/nfsd/nfs3xdr.c
> @@ -823,7 +823,7 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
>  		} else
>  			dchild = dget(dparent);
>  	} else
> -		dchild = lookup_one_len(name, dparent, namlen);
> +		dchild = lookup_one_len_unlocked(name, dparent, namlen);
>  	if (IS_ERR(dchild))
>  		return rv;
>  	if (d_mountpoint(dchild))
> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> index 158badf945df..2c1adaa0bd2f 100644
> --- a/fs/nfsd/nfs4xdr.c
> +++ b/fs/nfsd/nfs4xdr.c
> @@ -2804,14 +2804,14 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd,
>  	__be32 nfserr;
>  	int ignore_crossmnt = 0;
>  
> -	dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen);
> +	dentry = lookup_one_len_unlocked(name, cd->rd_fhp->fh_dentry, namlen);
>  	if (IS_ERR(dentry))
>  		return nfserrno(PTR_ERR(dentry));
>  	if (d_really_is_negative(dentry)) {
>  		/*
> -		 * nfsd_buffered_readdir drops the i_mutex between
> -		 * readdir and calling this callback, leaving a window
> -		 * where this directory entry could have gone away.
> +		 * we're not holding the i_mutex here, so there's
> +		 * a window where this directory entry could have gone
> +		 * away.
>  		 */
>  		dput(dentry);
>  		return nfserr_noent;
> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> index a30e79900086..6d5b33458e91 100644
> --- a/fs/nfsd/vfs.c
> +++ b/fs/nfsd/vfs.c
> @@ -217,10 +217,16 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
>  		host_err = PTR_ERR(dentry);
>  		if (IS_ERR(dentry))
>  			goto out_nfserr;
> -		/*
> -		 * check if we have crossed a mount point ...
> -		 */
>  		if (nfsd_mountpoint(dentry, exp)) {
> +			/*
> +			 * We don't need the i_mutex after all.  It's
> +			 * still possible we could open this (regular
> +			 * files can be mountpoints too), but the
> +			 * i_mutex is just there to prevent renames of
> +			 * something that we might be about to delegate,
> +			 * and a mountpoint won't be renamed:
> +			 */
> +			fh_unlock(fhp);
>  			if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) {
>  				dput(dentry);
>  				goto out_nfserr;
> @@ -1876,7 +1882,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
>  	offset = *offsetp;
>  
>  	while (1) {
> -		struct inode *dir_inode = file_inode(file);
>  		unsigned int reclen;
>  
>  		cdp->err = nfserr_eof; /* will be cleared on successful read */
> @@ -1895,15 +1900,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
>  		if (!size)
>  			break;
>  
> -		/*
> -		 * Various filldir functions may end up calling back into
> -		 * lookup_one_len() and the file system's ->lookup() method.
> -		 * These expect i_mutex to be held, as it would within readdir.
> -		 */
> -		host_err = mutex_lock_killable(&dir_inode->i_mutex);
> -		if (host_err)
> -			break;
> -
>  		de = (struct buffered_dirent *)buf.dirent;
>  		while (size > 0) {
>  			offset = de->offset;
> @@ -1920,7 +1916,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
>  			size -= reclen;
>  			de = (struct buffered_dirent *)((char *)de + reclen);
>  		}
> -		mutex_unlock(&dir_inode->i_mutex);
>  		if (size > 0) /* We bailed out early */
>  			break;
>  
> diff --git a/include/linux/namei.h b/include/linux/namei.h
> index c8990779f0c3..bb3a2f7cca67 100644
> --- a/include/linux/namei.h
> +++ b/include/linux/namei.h
> @@ -62,6 +62,7 @@ extern struct dentry *kern_path_locked(const char *, struct path *);
>  extern int kern_path_mountpoint(int, const char *, struct path *, unsigned int);
>  
>  extern struct dentry *lookup_one_len(const char *, struct dentry *, int);
> +extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int);
>  
>  extern int follow_down_one(struct path *);
>  extern int follow_down(struct path *);
> -- 
> 1.9.3
> 

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] nfsd: don't hold i_mutex over userspace upcalls
  2015-06-03 15:18                     ` J. Bruce Fields
@ 2015-07-05 11:27                       ` Kinglong Mee
  2015-07-06 18:22                         ` J. Bruce Fields
  2015-08-18 19:10                       ` J. Bruce Fields
  1 sibling, 1 reply; 7+ messages in thread
From: Kinglong Mee @ 2015-07-05 11:27 UTC (permalink / raw)
  To: J. Bruce Fields, NeilBrown, Al Viro
  Cc: linux-nfs@vger.kernel.org, linux-fsdevel, kinglongmee

Ping...

What's the state of this patch ?
Without modify nfs-utils, this one is make sense.

thanks,
Kinglong Mee
On 6/3/2015 23:18, J. Bruce Fields wrote:
> This passes my review, but it needs an ACK from Al or someone for the
> addition of the new lookup_one_len_unlocked (which is the same as
> lookup_one_len except that it takes the i_mutex itself when required
> instead of requiring the caller to).
> 
> --b.
> 
> On Fri, May 08, 2015 at 04:01:33PM -0400, J. Bruce Fields wrote:
>> From: NeilBrown <neilb@suse.de>
>>
>> We need information about exports when crossing mountpoints during
>> lookup or NFSv4 readdir.  If we don't already have that information
>> cached, we may have to ask (and wait for) rpc.mountd.
>>
>> In both cases we currently hold the i_mutex on the parent of the
>> directory we're asking rpc.mountd about.  We've seen situations where
>> rpc.mountd performs some operation on that directory that tries to take
>> the i_mutex again, resulting in deadlock.
>>
>> With some care, we may be able to avoid that in rpc.mountd.  But it
>> seems better just to avoid holding a mutex while waiting on userspace.
>>
>> It appears that lookup_one_len is pretty much the only operation that
>> needs the i_mutex.  So we could just drop the i_mutex elsewhere and do
>> something like
>>
>> 	mutex_lock()
>> 	lookup_one_len()
>> 	mutex_unlock()
>>
>> In many cases though the lookup would have been cached and not required
>> the i_mutex, so it's more efficient to create a lookup_one_len() variant
>> that only takes the i_mutex when necessary.
>>
>> Signed-off-by: J. Bruce Fields <bfields@redhat.com>
>> ---
>>  fs/namei.c            | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++
>>  fs/nfsd/nfs3xdr.c     |  2 +-
>>  fs/nfsd/nfs4xdr.c     |  8 +++---
>>  fs/nfsd/vfs.c         | 23 +++++++---------
>>  include/linux/namei.h |  1 +
>>  5 files changed, 89 insertions(+), 19 deletions(-)
>>
>> Here's an updated patch.
>>
>> diff --git a/fs/namei.c b/fs/namei.c
>> index 4a8d998b7274..8b866d79c5b7 100644
>> --- a/fs/namei.c
>> +++ b/fs/namei.c
>> @@ -2139,6 +2139,8 @@ static struct dentry *lookup_hash(struct nameidata *nd)
>>   *
>>   * Note that this routine is purely a helper for filesystem usage and should
>>   * not be called by generic code.
>> + *
>> + * The caller must hold base->i_mutex.
>>   */
>>  struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
>>  {
>> @@ -2182,6 +2184,78 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
>>  }
>>  EXPORT_SYMBOL(lookup_one_len);
>>  
>> +/**
>> + * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
>> + * @name:	pathname component to lookup
>> + * @base:	base directory to lookup from
>> + * @len:	maximum length @len should be interpreted to
>> + *
>> + * Note that this routine is purely a helper for filesystem usage and should
>> + * not be called by generic code.
>> + *
>> + * Unlike lookup_one_len, it should be called without the parent
>> + * i_mutex held, and will take the i_mutex itself if necessary.
>> + */
>> +struct dentry *lookup_one_len_unlocked(const char *name,
>> +				       struct dentry *base, int len)
>> +{
>> +	struct qstr this;
>> +	unsigned int c;
>> +	int err;
>> +	struct dentry *ret;
>> +
>> +	this.name = name;
>> +	this.len = len;
>> +	this.hash = full_name_hash(name, len);
>> +	if (!len)
>> +		return ERR_PTR(-EACCES);
>> +
>> +	if (unlikely(name[0] == '.')) {
>> +		if (len < 2 || (len == 2 && name[1] == '.'))
>> +			return ERR_PTR(-EACCES);
>> +	}
>> +
>> +	while (len--) {
>> +		c = *(const unsigned char *)name++;
>> +		if (c == '/' || c == '\0')
>> +			return ERR_PTR(-EACCES);
>> +	}
>> +	/*
>> +	 * See if the low-level filesystem might want
>> +	 * to use its own hash..
>> +	 */
>> +	if (base->d_flags & DCACHE_OP_HASH) {
>> +		int err = base->d_op->d_hash(base, &this);
>> +		if (err < 0)
>> +			return ERR_PTR(err);
>> +	}
>> +
>> +	err = inode_permission(base->d_inode, MAY_EXEC);
>> +	if (err)
>> +		return ERR_PTR(err);
>> +
>> +	ret = __d_lookup(base, &this);
>> +	if (ret)
>> +		return ret;
>> +	/*
>> +	 * __d_lookup() is used to try to get a quick answer and avoid the
>> +	 * mutex.  A false-negative does no harm.
>> +	 */
>> +	ret = __d_lookup(base, &this);
>> +	if (ret && ret->d_flags & DCACHE_OP_REVALIDATE) {
>> +		dput(ret);
>> +		ret = NULL;
>> +	}
>> +	if (ret)
>> +		return ret;
>> +
>> +	mutex_lock(&base->d_inode->i_mutex);
>> +	ret =  __lookup_hash(&this, base, 0);
>> +	mutex_unlock(&base->d_inode->i_mutex);
>> +	return ret;
>> +}
>> +EXPORT_SYMBOL(lookup_one_len_unlocked);
>> +
>>  int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
>>  		 struct path *path, int *empty)
>>  {
>> diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
>> index f6e7cbabac5a..01dcd494f781 100644
>> --- a/fs/nfsd/nfs3xdr.c
>> +++ b/fs/nfsd/nfs3xdr.c
>> @@ -823,7 +823,7 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
>>  		} else
>>  			dchild = dget(dparent);
>>  	} else
>> -		dchild = lookup_one_len(name, dparent, namlen);
>> +		dchild = lookup_one_len_unlocked(name, dparent, namlen);
>>  	if (IS_ERR(dchild))
>>  		return rv;
>>  	if (d_mountpoint(dchild))
>> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
>> index 158badf945df..2c1adaa0bd2f 100644
>> --- a/fs/nfsd/nfs4xdr.c
>> +++ b/fs/nfsd/nfs4xdr.c
>> @@ -2804,14 +2804,14 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd,
>>  	__be32 nfserr;
>>  	int ignore_crossmnt = 0;
>>  
>> -	dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen);
>> +	dentry = lookup_one_len_unlocked(name, cd->rd_fhp->fh_dentry, namlen);
>>  	if (IS_ERR(dentry))
>>  		return nfserrno(PTR_ERR(dentry));
>>  	if (d_really_is_negative(dentry)) {
>>  		/*
>> -		 * nfsd_buffered_readdir drops the i_mutex between
>> -		 * readdir and calling this callback, leaving a window
>> -		 * where this directory entry could have gone away.
>> +		 * we're not holding the i_mutex here, so there's
>> +		 * a window where this directory entry could have gone
>> +		 * away.
>>  		 */
>>  		dput(dentry);
>>  		return nfserr_noent;
>> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
>> index a30e79900086..6d5b33458e91 100644
>> --- a/fs/nfsd/vfs.c
>> +++ b/fs/nfsd/vfs.c
>> @@ -217,10 +217,16 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
>>  		host_err = PTR_ERR(dentry);
>>  		if (IS_ERR(dentry))
>>  			goto out_nfserr;
>> -		/*
>> -		 * check if we have crossed a mount point ...
>> -		 */
>>  		if (nfsd_mountpoint(dentry, exp)) {
>> +			/*
>> +			 * We don't need the i_mutex after all.  It's
>> +			 * still possible we could open this (regular
>> +			 * files can be mountpoints too), but the
>> +			 * i_mutex is just there to prevent renames of
>> +			 * something that we might be about to delegate,
>> +			 * and a mountpoint won't be renamed:
>> +			 */
>> +			fh_unlock(fhp);
>>  			if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) {
>>  				dput(dentry);
>>  				goto out_nfserr;
>> @@ -1876,7 +1882,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
>>  	offset = *offsetp;
>>  
>>  	while (1) {
>> -		struct inode *dir_inode = file_inode(file);
>>  		unsigned int reclen;
>>  
>>  		cdp->err = nfserr_eof; /* will be cleared on successful read */
>> @@ -1895,15 +1900,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
>>  		if (!size)
>>  			break;
>>  
>> -		/*
>> -		 * Various filldir functions may end up calling back into
>> -		 * lookup_one_len() and the file system's ->lookup() method.
>> -		 * These expect i_mutex to be held, as it would within readdir.
>> -		 */
>> -		host_err = mutex_lock_killable(&dir_inode->i_mutex);
>> -		if (host_err)
>> -			break;
>> -
>>  		de = (struct buffered_dirent *)buf.dirent;
>>  		while (size > 0) {
>>  			offset = de->offset;
>> @@ -1920,7 +1916,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
>>  			size -= reclen;
>>  			de = (struct buffered_dirent *)((char *)de + reclen);
>>  		}
>> -		mutex_unlock(&dir_inode->i_mutex);
>>  		if (size > 0) /* We bailed out early */
>>  			break;
>>  
>> diff --git a/include/linux/namei.h b/include/linux/namei.h
>> index c8990779f0c3..bb3a2f7cca67 100644
>> --- a/include/linux/namei.h
>> +++ b/include/linux/namei.h
>> @@ -62,6 +62,7 @@ extern struct dentry *kern_path_locked(const char *, struct path *);
>>  extern int kern_path_mountpoint(int, const char *, struct path *, unsigned int);
>>  
>>  extern struct dentry *lookup_one_len(const char *, struct dentry *, int);
>> +extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int);
>>  
>>  extern int follow_down_one(struct path *);
>>  extern int follow_down(struct path *);
>> -- 
>> 1.9.3
>>
> 

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] nfsd: don't hold i_mutex over userspace upcalls
  2015-07-05 11:27                       ` Kinglong Mee
@ 2015-07-06 18:22                         ` J. Bruce Fields
  0 siblings, 0 replies; 7+ messages in thread
From: J. Bruce Fields @ 2015-07-06 18:22 UTC (permalink / raw)
  To: Kinglong Mee; +Cc: NeilBrown, Al Viro, linux-nfs@vger.kernel.org, linux-fsdevel

On Sun, Jul 05, 2015 at 07:27:25PM +0800, Kinglong Mee wrote:
> Ping...
> 
> What's the state of this patch ?

I think I still need an ACK from Al for the addition of
lookup_one_len_unlocked.

--b.

> On 6/3/2015 23:18, J. Bruce Fields wrote:
> > This passes my review, but it needs an ACK from Al or someone for the
> > addition of the new lookup_one_len_unlocked (which is the same as
> > lookup_one_len except that it takes the i_mutex itself when required
> > instead of requiring the caller to).
> > 
> > --b.
> > 
> > On Fri, May 08, 2015 at 04:01:33PM -0400, J. Bruce Fields wrote:
> >> From: NeilBrown <neilb@suse.de>
> >>
> >> We need information about exports when crossing mountpoints during
> >> lookup or NFSv4 readdir.  If we don't already have that information
> >> cached, we may have to ask (and wait for) rpc.mountd.
> >>
> >> In both cases we currently hold the i_mutex on the parent of the
> >> directory we're asking rpc.mountd about.  We've seen situations where
> >> rpc.mountd performs some operation on that directory that tries to take
> >> the i_mutex again, resulting in deadlock.
> >>
> >> With some care, we may be able to avoid that in rpc.mountd.  But it
> >> seems better just to avoid holding a mutex while waiting on userspace.
> >>
> >> It appears that lookup_one_len is pretty much the only operation that
> >> needs the i_mutex.  So we could just drop the i_mutex elsewhere and do
> >> something like
> >>
> >> 	mutex_lock()
> >> 	lookup_one_len()
> >> 	mutex_unlock()
> >>
> >> In many cases though the lookup would have been cached and not required
> >> the i_mutex, so it's more efficient to create a lookup_one_len() variant
> >> that only takes the i_mutex when necessary.
> >>
> >> Signed-off-by: J. Bruce Fields <bfields@redhat.com>
> >> ---
> >>  fs/namei.c            | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++
> >>  fs/nfsd/nfs3xdr.c     |  2 +-
> >>  fs/nfsd/nfs4xdr.c     |  8 +++---
> >>  fs/nfsd/vfs.c         | 23 +++++++---------
> >>  include/linux/namei.h |  1 +
> >>  5 files changed, 89 insertions(+), 19 deletions(-)
> >>
> >> Here's an updated patch.
> >>
> >> diff --git a/fs/namei.c b/fs/namei.c
> >> index 4a8d998b7274..8b866d79c5b7 100644
> >> --- a/fs/namei.c
> >> +++ b/fs/namei.c
> >> @@ -2139,6 +2139,8 @@ static struct dentry *lookup_hash(struct nameidata *nd)
> >>   *
> >>   * Note that this routine is purely a helper for filesystem usage and should
> >>   * not be called by generic code.
> >> + *
> >> + * The caller must hold base->i_mutex.
> >>   */
> >>  struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
> >>  {
> >> @@ -2182,6 +2184,78 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
> >>  }
> >>  EXPORT_SYMBOL(lookup_one_len);
> >>  
> >> +/**
> >> + * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
> >> + * @name:	pathname component to lookup
> >> + * @base:	base directory to lookup from
> >> + * @len:	maximum length @len should be interpreted to
> >> + *
> >> + * Note that this routine is purely a helper for filesystem usage and should
> >> + * not be called by generic code.
> >> + *
> >> + * Unlike lookup_one_len, it should be called without the parent
> >> + * i_mutex held, and will take the i_mutex itself if necessary.
> >> + */
> >> +struct dentry *lookup_one_len_unlocked(const char *name,
> >> +				       struct dentry *base, int len)
> >> +{
> >> +	struct qstr this;
> >> +	unsigned int c;
> >> +	int err;
> >> +	struct dentry *ret;
> >> +
> >> +	this.name = name;
> >> +	this.len = len;
> >> +	this.hash = full_name_hash(name, len);
> >> +	if (!len)
> >> +		return ERR_PTR(-EACCES);
> >> +
> >> +	if (unlikely(name[0] == '.')) {
> >> +		if (len < 2 || (len == 2 && name[1] == '.'))
> >> +			return ERR_PTR(-EACCES);
> >> +	}
> >> +
> >> +	while (len--) {
> >> +		c = *(const unsigned char *)name++;
> >> +		if (c == '/' || c == '\0')
> >> +			return ERR_PTR(-EACCES);
> >> +	}
> >> +	/*
> >> +	 * See if the low-level filesystem might want
> >> +	 * to use its own hash..
> >> +	 */
> >> +	if (base->d_flags & DCACHE_OP_HASH) {
> >> +		int err = base->d_op->d_hash(base, &this);
> >> +		if (err < 0)
> >> +			return ERR_PTR(err);
> >> +	}
> >> +
> >> +	err = inode_permission(base->d_inode, MAY_EXEC);
> >> +	if (err)
> >> +		return ERR_PTR(err);
> >> +
> >> +	ret = __d_lookup(base, &this);
> >> +	if (ret)
> >> +		return ret;
> >> +	/*
> >> +	 * __d_lookup() is used to try to get a quick answer and avoid the
> >> +	 * mutex.  A false-negative does no harm.
> >> +	 */
> >> +	ret = __d_lookup(base, &this);
> >> +	if (ret && ret->d_flags & DCACHE_OP_REVALIDATE) {
> >> +		dput(ret);
> >> +		ret = NULL;
> >> +	}
> >> +	if (ret)
> >> +		return ret;
> >> +
> >> +	mutex_lock(&base->d_inode->i_mutex);
> >> +	ret =  __lookup_hash(&this, base, 0);
> >> +	mutex_unlock(&base->d_inode->i_mutex);
> >> +	return ret;
> >> +}
> >> +EXPORT_SYMBOL(lookup_one_len_unlocked);
> >> +
> >>  int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
> >>  		 struct path *path, int *empty)
> >>  {
> >> diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
> >> index f6e7cbabac5a..01dcd494f781 100644
> >> --- a/fs/nfsd/nfs3xdr.c
> >> +++ b/fs/nfsd/nfs3xdr.c
> >> @@ -823,7 +823,7 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
> >>  		} else
> >>  			dchild = dget(dparent);
> >>  	} else
> >> -		dchild = lookup_one_len(name, dparent, namlen);
> >> +		dchild = lookup_one_len_unlocked(name, dparent, namlen);
> >>  	if (IS_ERR(dchild))
> >>  		return rv;
> >>  	if (d_mountpoint(dchild))
> >> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> >> index 158badf945df..2c1adaa0bd2f 100644
> >> --- a/fs/nfsd/nfs4xdr.c
> >> +++ b/fs/nfsd/nfs4xdr.c
> >> @@ -2804,14 +2804,14 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd,
> >>  	__be32 nfserr;
> >>  	int ignore_crossmnt = 0;
> >>  
> >> -	dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen);
> >> +	dentry = lookup_one_len_unlocked(name, cd->rd_fhp->fh_dentry, namlen);
> >>  	if (IS_ERR(dentry))
> >>  		return nfserrno(PTR_ERR(dentry));
> >>  	if (d_really_is_negative(dentry)) {
> >>  		/*
> >> -		 * nfsd_buffered_readdir drops the i_mutex between
> >> -		 * readdir and calling this callback, leaving a window
> >> -		 * where this directory entry could have gone away.
> >> +		 * we're not holding the i_mutex here, so there's
> >> +		 * a window where this directory entry could have gone
> >> +		 * away.
> >>  		 */
> >>  		dput(dentry);
> >>  		return nfserr_noent;
> >> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> >> index a30e79900086..6d5b33458e91 100644
> >> --- a/fs/nfsd/vfs.c
> >> +++ b/fs/nfsd/vfs.c
> >> @@ -217,10 +217,16 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
> >>  		host_err = PTR_ERR(dentry);
> >>  		if (IS_ERR(dentry))
> >>  			goto out_nfserr;
> >> -		/*
> >> -		 * check if we have crossed a mount point ...
> >> -		 */
> >>  		if (nfsd_mountpoint(dentry, exp)) {
> >> +			/*
> >> +			 * We don't need the i_mutex after all.  It's
> >> +			 * still possible we could open this (regular
> >> +			 * files can be mountpoints too), but the
> >> +			 * i_mutex is just there to prevent renames of
> >> +			 * something that we might be about to delegate,
> >> +			 * and a mountpoint won't be renamed:
> >> +			 */
> >> +			fh_unlock(fhp);
> >>  			if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) {
> >>  				dput(dentry);
> >>  				goto out_nfserr;
> >> @@ -1876,7 +1882,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
> >>  	offset = *offsetp;
> >>  
> >>  	while (1) {
> >> -		struct inode *dir_inode = file_inode(file);
> >>  		unsigned int reclen;
> >>  
> >>  		cdp->err = nfserr_eof; /* will be cleared on successful read */
> >> @@ -1895,15 +1900,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
> >>  		if (!size)
> >>  			break;
> >>  
> >> -		/*
> >> -		 * Various filldir functions may end up calling back into
> >> -		 * lookup_one_len() and the file system's ->lookup() method.
> >> -		 * These expect i_mutex to be held, as it would within readdir.
> >> -		 */
> >> -		host_err = mutex_lock_killable(&dir_inode->i_mutex);
> >> -		if (host_err)
> >> -			break;
> >> -
> >>  		de = (struct buffered_dirent *)buf.dirent;
> >>  		while (size > 0) {
> >>  			offset = de->offset;
> >> @@ -1920,7 +1916,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
> >>  			size -= reclen;
> >>  			de = (struct buffered_dirent *)((char *)de + reclen);
> >>  		}
> >> -		mutex_unlock(&dir_inode->i_mutex);
> >>  		if (size > 0) /* We bailed out early */
> >>  			break;
> >>  
> >> diff --git a/include/linux/namei.h b/include/linux/namei.h
> >> index c8990779f0c3..bb3a2f7cca67 100644
> >> --- a/include/linux/namei.h
> >> +++ b/include/linux/namei.h
> >> @@ -62,6 +62,7 @@ extern struct dentry *kern_path_locked(const char *, struct path *);
> >>  extern int kern_path_mountpoint(int, const char *, struct path *, unsigned int);
> >>  
> >>  extern struct dentry *lookup_one_len(const char *, struct dentry *, int);
> >> +extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int);
> >>  
> >>  extern int follow_down_one(struct path *);
> >>  extern int follow_down(struct path *);
> >> -- 
> >> 1.9.3
> >>
> > 

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] nfsd: don't hold i_mutex over userspace upcalls
  2015-06-03 15:18                     ` J. Bruce Fields
  2015-07-05 11:27                       ` Kinglong Mee
@ 2015-08-18 19:10                       ` J. Bruce Fields
  2015-11-12 21:22                         ` J. Bruce Fields
  1 sibling, 1 reply; 7+ messages in thread
From: J. Bruce Fields @ 2015-08-18 19:10 UTC (permalink / raw)
  To: NeilBrown; +Cc: Kinglong Mee, linux-nfs@vger.kernel.org, Al Viro, linux-fsdevel

Al, can I get an ACK/NACK for this?

--b.

commit 5494e10316e3
Author: NeilBrown <neilb@suse.de>
Date:   Fri May 1 11:53:26 2015 +1000

    nfsd: don't hold i_mutex over userspace upcalls
    
    We need information about exports when crossing mountpoints during
    lookup or NFSv4 readdir.  If we don't already have that information
    cached, we may have to ask (and wait for) rpc.mountd.
    
    In both cases we currently hold the i_mutex on the parent of the
    directory we're asking rpc.mountd about.  We've seen situations where
    rpc.mountd performs some operation on that directory that tries to take
    the i_mutex again, resulting in deadlock.
    
    With some care, we may be able to avoid that in rpc.mountd.  But it
    seems better just to avoid holding a mutex while waiting on userspace.
    
    It appears that lookup_one_len is pretty much the only operation that
    needs the i_mutex.  So we could just drop the i_mutex elsewhere and do
    something like
    
    	mutex_lock()
    	lookup_one_len()
    	mutex_unlock()
    
    In many cases though the lookup would have been cached and not required
    the i_mutex, so it's more efficient to create a lookup_one_len() variant
    that only takes the i_mutex when necessary.
    
    Signed-off-by: NeilBrown <neilb@suse.de>
    Signed-off-by: J. Bruce Fields <bfields@redhat.com>

diff --git a/fs/namei.c b/fs/namei.c
index ae4e4c18b2ac..1627d0302a59 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2250,6 +2250,8 @@ EXPORT_SYMBOL(vfs_path_lookup);
  *
  * Note that this routine is purely a helper for filesystem usage and should
  * not be called by generic code.
+ *
+ * The caller must hold base->i_mutex.
  */
 struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 {
@@ -2293,6 +2295,78 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 }
 EXPORT_SYMBOL(lookup_one_len);
 
+/**
+ * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
+ * @name:	pathname component to lookup
+ * @base:	base directory to lookup from
+ * @len:	maximum length @len should be interpreted to
+ *
+ * Note that this routine is purely a helper for filesystem usage and should
+ * not be called by generic code.
+ *
+ * Unlike lookup_one_len, it should be called without the parent
+ * i_mutex held, and will take the i_mutex itself if necessary.
+ */
+struct dentry *lookup_one_len_unlocked(const char *name,
+				       struct dentry *base, int len)
+{
+	struct qstr this;
+	unsigned int c;
+	int err;
+	struct dentry *ret;
+
+	this.name = name;
+	this.len = len;
+	this.hash = full_name_hash(name, len);
+	if (!len)
+		return ERR_PTR(-EACCES);
+
+	if (unlikely(name[0] == '.')) {
+		if (len < 2 || (len == 2 && name[1] == '.'))
+			return ERR_PTR(-EACCES);
+	}
+
+	while (len--) {
+		c = *(const unsigned char *)name++;
+		if (c == '/' || c == '\0')
+			return ERR_PTR(-EACCES);
+	}
+	/*
+	 * See if the low-level filesystem might want
+	 * to use its own hash..
+	 */
+	if (base->d_flags & DCACHE_OP_HASH) {
+		int err = base->d_op->d_hash(base, &this);
+		if (err < 0)
+			return ERR_PTR(err);
+	}
+
+	err = inode_permission(base->d_inode, MAY_EXEC);
+	if (err)
+		return ERR_PTR(err);
+
+	ret = __d_lookup(base, &this);
+	if (ret)
+		return ret;
+	/*
+	 * __d_lookup() is used to try to get a quick answer and avoid the
+	 * mutex.  A false-negative does no harm.
+	 */
+	ret = __d_lookup(base, &this);
+	if (ret && ret->d_flags & DCACHE_OP_REVALIDATE) {
+		dput(ret);
+		ret = NULL;
+	}
+	if (ret)
+		return ret;
+
+	mutex_lock(&base->d_inode->i_mutex);
+	ret =  __lookup_hash(&this, base, 0);
+	mutex_unlock(&base->d_inode->i_mutex);
+	return ret;
+}
+EXPORT_SYMBOL(lookup_one_len_unlocked);
+
 int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
 		 struct path *path, int *empty)
 {
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index f6e7cbabac5a..01dcd494f781 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -823,7 +823,7 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
 		} else
 			dchild = dget(dparent);
 	} else
-		dchild = lookup_one_len(name, dparent, namlen);
+		dchild = lookup_one_len_unlocked(name, dparent, namlen);
 	if (IS_ERR(dchild))
 		return rv;
 	if (d_mountpoint(dchild))
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 51c9e9ca39a4..325521ce389a 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2838,14 +2838,14 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd,
 	__be32 nfserr;
 	int ignore_crossmnt = 0;
 
-	dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen);
+	dentry = lookup_one_len_unlocked(name, cd->rd_fhp->fh_dentry, namlen);
 	if (IS_ERR(dentry))
 		return nfserrno(PTR_ERR(dentry));
 	if (d_really_is_negative(dentry)) {
 		/*
-		 * nfsd_buffered_readdir drops the i_mutex between
-		 * readdir and calling this callback, leaving a window
-		 * where this directory entry could have gone away.
+		 * we're not holding the i_mutex here, so there's
+		 * a window where this directory entry could have gone
+		 * away.
 		 */
 		dput(dentry);
 		return nfserr_noent;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 45c04979e7b3..2ea6c6a37364 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -217,10 +217,16 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		host_err = PTR_ERR(dentry);
 		if (IS_ERR(dentry))
 			goto out_nfserr;
-		/*
-		 * check if we have crossed a mount point ...
-		 */
 		if (nfsd_mountpoint(dentry, exp)) {
+			/*
+			 * We don't need the i_mutex after all.  It's
+			 * still possible we could open this (regular
+			 * files can be mountpoints too), but the
+			 * i_mutex is just there to prevent renames of
+			 * something that we might be about to delegate,
+			 * and a mountpoint won't be renamed:
+			 */
+			fh_unlock(fhp);
 			if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) {
 				dput(dentry);
 				goto out_nfserr;
@@ -1809,7 +1815,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
 	offset = *offsetp;
 
 	while (1) {
-		struct inode *dir_inode = file_inode(file);
 		unsigned int reclen;
 
 		cdp->err = nfserr_eof; /* will be cleared on successful read */
@@ -1828,15 +1833,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
 		if (!size)
 			break;
 
-		/*
-		 * Various filldir functions may end up calling back into
-		 * lookup_one_len() and the file system's ->lookup() method.
-		 * These expect i_mutex to be held, as it would within readdir.
-		 */
-		host_err = mutex_lock_killable(&dir_inode->i_mutex);
-		if (host_err)
-			break;
-
 		de = (struct buffered_dirent *)buf.dirent;
 		while (size > 0) {
 			offset = de->offset;
@@ -1853,7 +1849,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
 			size -= reclen;
 			de = (struct buffered_dirent *)((char *)de + reclen);
 		}
-		mutex_unlock(&dir_inode->i_mutex);
 		if (size > 0) /* We bailed out early */
 			break;
 
diff --git a/include/linux/namei.h b/include/linux/namei.h
index d8c6334cd150..d0f25d81b46a 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -77,6 +77,7 @@ extern struct dentry *kern_path_locked(const char *, struct path *);
 extern int kern_path_mountpoint(int, const char *, struct path *, unsigned int);
 
 extern struct dentry *lookup_one_len(const char *, struct dentry *, int);
+extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int);
 
 extern int follow_down_one(struct path *);
 extern int follow_down(struct path *);

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH] nfsd: don't hold i_mutex over userspace upcalls
  2015-08-18 19:10                       ` J. Bruce Fields
@ 2015-11-12 21:22                         ` J. Bruce Fields
  0 siblings, 0 replies; 7+ messages in thread
From: J. Bruce Fields @ 2015-11-12 21:22 UTC (permalink / raw)
  To: Al Viro; +Cc: linux-fsdevel, NeilBrown, Kinglong Mee, linux-nfs@vger.kernel.org

From: NeilBrown <neilb@suse.de>
Subject: [PATCH] nfsd: don't hold i_mutex over userspace upcalls

We need information about exports when crossing mountpoints during
lookup or NFSv4 readdir.  If we don't already have that information
cached, we may have to ask (and wait for) rpc.mountd.

In both cases we currently hold the i_mutex on the parent of the
directory we're asking rpc.mountd about.  We've seen situations where
rpc.mountd performs some operation on that directory that tries to take
the i_mutex again, resulting in deadlock.

With some care, we may be able to avoid that in rpc.mountd.  But it
seems better just to avoid holding a mutex while waiting on userspace.

It appears that lookup_one_len is pretty much the only operation that
needs the i_mutex.  So we could just drop the i_mutex elsewhere and do
something like

	mutex_lock()
	lookup_one_len()
	mutex_unlock()

In many cases though the lookup would have been cached and not required
the i_mutex, so it's more efficient to create a lookup_one_len() variant
that only takes the i_mutex when necessary.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/namei.c            | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/nfsd/nfs3xdr.c     |  2 +-
 fs/nfsd/nfs4xdr.c     |  8 +++---
 fs/nfsd/vfs.c         | 23 +++++++---------
 include/linux/namei.h |  1 +
 5 files changed, 89 insertions(+), 19 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 726d211db484..d68c21fe4598 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2278,6 +2278,8 @@ EXPORT_SYMBOL(vfs_path_lookup);
  *
  * Note that this routine is purely a helper for filesystem usage and should
  * not be called by generic code.
+ *
+ * The caller must hold base->i_mutex.
  */
 struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 {
@@ -2321,6 +2323,78 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 }
 EXPORT_SYMBOL(lookup_one_len);
 
+/**
+ * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
+ * @name:	pathname component to lookup
+ * @base:	base directory to lookup from
+ * @len:	maximum length @len should be interpreted to
+ *
+ * Note that this routine is purely a helper for filesystem usage and should
+ * not be called by generic code.
+ *
+ * Unlike lookup_one_len, it should be called without the parent
+ * i_mutex held, and will take the i_mutex itself if necessary.
+ */
+struct dentry *lookup_one_len_unlocked(const char *name,
+				       struct dentry *base, int len)
+{
+	struct qstr this;
+	unsigned int c;
+	int err;
+	struct dentry *ret;
+
+	this.name = name;
+	this.len = len;
+	this.hash = full_name_hash(name, len);
+	if (!len)
+		return ERR_PTR(-EACCES);
+
+	if (unlikely(name[0] == '.')) {
+		if (len < 2 || (len == 2 && name[1] == '.'))
+			return ERR_PTR(-EACCES);
+	}
+
+	while (len--) {
+		c = *(const unsigned char *)name++;
+		if (c == '/' || c == '\0')
+			return ERR_PTR(-EACCES);
+	}
+	/*
+	 * See if the low-level filesystem might want
+	 * to use its own hash..
+	 */
+	if (base->d_flags & DCACHE_OP_HASH) {
+		int err = base->d_op->d_hash(base, &this);
+		if (err < 0)
+			return ERR_PTR(err);
+	}
+
+	err = inode_permission(base->d_inode, MAY_EXEC);
+	if (err)
+		return ERR_PTR(err);
+
+	ret = __d_lookup(base, &this);
+	if (ret)
+		return ret;
+	/*
+	 * __d_lookup() is used to try to get a quick answer and avoid the
+	 * mutex.  A false-negative does no harm.
+	 */
+	ret = __d_lookup(base, &this);
+	if (ret && ret->d_flags & DCACHE_OP_REVALIDATE) {
+		dput(ret);
+		ret = NULL;
+	}
+	if (ret)
+		return ret;
+
+	mutex_lock(&base->d_inode->i_mutex);
+	ret =  __lookup_hash(&this, base, 0);
+	mutex_unlock(&base->d_inode->i_mutex);
+	return ret;
+}
+EXPORT_SYMBOL(lookup_one_len_unlocked);
+
 int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
 		 struct path *path, int *empty)
 {
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 00575d776d91..2246454dec76 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -823,7 +823,7 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
 		} else
 			dchild = dget(dparent);
 	} else
-		dchild = lookup_one_len(name, dparent, namlen);
+		dchild = lookup_one_len_unlocked(name, dparent, namlen);
 	if (IS_ERR(dchild))
 		return rv;
 	if (d_mountpoint(dchild))
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 51c9e9ca39a4..325521ce389a 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2838,14 +2838,14 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd,
 	__be32 nfserr;
 	int ignore_crossmnt = 0;
 
-	dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen);
+	dentry = lookup_one_len_unlocked(name, cd->rd_fhp->fh_dentry, namlen);
 	if (IS_ERR(dentry))
 		return nfserrno(PTR_ERR(dentry));
 	if (d_really_is_negative(dentry)) {
 		/*
-		 * nfsd_buffered_readdir drops the i_mutex between
-		 * readdir and calling this callback, leaving a window
-		 * where this directory entry could have gone away.
+		 * we're not holding the i_mutex here, so there's
+		 * a window where this directory entry could have gone
+		 * away.
 		 */
 		dput(dentry);
 		return nfserr_noent;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 994d66fbb446..4212aaacbb55 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -217,10 +217,16 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		host_err = PTR_ERR(dentry);
 		if (IS_ERR(dentry))
 			goto out_nfserr;
-		/*
-		 * check if we have crossed a mount point ...
-		 */
 		if (nfsd_mountpoint(dentry, exp)) {
+			/*
+			 * We don't need the i_mutex after all.  It's
+			 * still possible we could open this (regular
+			 * files can be mountpoints too), but the
+			 * i_mutex is just there to prevent renames of
+			 * something that we might be about to delegate,
+			 * and a mountpoint won't be renamed:
+			 */
+			fh_unlock(fhp);
 			if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) {
 				dput(dentry);
 				goto out_nfserr;
@@ -1809,7 +1815,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
 	offset = *offsetp;
 
 	while (1) {
-		struct inode *dir_inode = file_inode(file);
 		unsigned int reclen;
 
 		cdp->err = nfserr_eof; /* will be cleared on successful read */
@@ -1828,15 +1833,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
 		if (!size)
 			break;
 
-		/*
-		 * Various filldir functions may end up calling back into
-		 * lookup_one_len() and the file system's ->lookup() method.
-		 * These expect i_mutex to be held, as it would within readdir.
-		 */
-		host_err = mutex_lock_killable(&dir_inode->i_mutex);
-		if (host_err)
-			break;
-
 		de = (struct buffered_dirent *)buf.dirent;
 		while (size > 0) {
 			offset = de->offset;
@@ -1853,7 +1849,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
 			size -= reclen;
 			de = (struct buffered_dirent *)((char *)de + reclen);
 		}
-		mutex_unlock(&dir_inode->i_mutex);
 		if (size > 0) /* We bailed out early */
 			break;
 
diff --git a/include/linux/namei.h b/include/linux/namei.h
index d8c6334cd150..d0f25d81b46a 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -77,6 +77,7 @@ extern struct dentry *kern_path_locked(const char *, struct path *);
 extern int kern_path_mountpoint(int, const char *, struct path *, unsigned int);
 
 extern struct dentry *lookup_one_len(const char *, struct dentry *, int);
+extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int);
 
 extern int follow_down_one(struct path *);
 extern int follow_down(struct path *);
-- 
2.5.0


^ permalink raw reply related	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2016-01-07 21:08 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2016-01-07 21:08 [PATCH] nfsd: don't hold i_mutex over userspace upcalls J. Bruce Fields
  -- strict thread matches above, loose matches on Subject: below --
2015-04-29 19:19 [PATCH RFC] NFSD: fix cannot umounting mount points under pseudo root J. Bruce Fields
2015-04-29 21:52 ` NeilBrown
2015-04-30 21:36   ` J. Bruce Fields
2015-05-01  1:53     ` NeilBrown
2015-05-04 22:01       ` J. Bruce Fields
2015-05-05 13:54         ` Kinglong Mee
2015-05-05 14:18           ` J. Bruce Fields
2015-05-05 15:52             ` J. Bruce Fields
2015-05-05 22:26               ` NeilBrown
2015-05-08 16:15                 ` J. Bruce Fields
2015-05-08 20:01                   ` [PATCH] nfsd: don't hold i_mutex over userspace upcalls J. Bruce Fields
2015-06-03 15:18                     ` J. Bruce Fields
2015-07-05 11:27                       ` Kinglong Mee
2015-07-06 18:22                         ` J. Bruce Fields
2015-08-18 19:10                       ` J. Bruce Fields
2015-11-12 21:22                         ` J. Bruce Fields

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).