linux-nfs.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 0/7] direct I/O fixes
@ 2013-11-14 16:50 Christoph Hellwig
  2013-11-14 16:50 ` [PATCH 1/7] nfs: fix size updates for aio writes Christoph Hellwig
                   ` (6 more replies)
  0 siblings, 7 replies; 37+ messages in thread
From: Christoph Hellwig @ 2013-11-14 16:50 UTC (permalink / raw)
  To: Trond.Myklebust; +Cc: linux-nfs



^ permalink raw reply	[flat|nested] 37+ messages in thread

* [PATCH 1/7] nfs: fix size updates for aio writes
  2013-11-14 16:50 [PATCH 0/7] direct I/O fixes Christoph Hellwig
@ 2013-11-14 16:50 ` Christoph Hellwig
  2013-11-14 16:50 ` [PATCH 2/7] nfs: defer inode_dio_done call until size update is done Christoph Hellwig
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 37+ messages in thread
From: Christoph Hellwig @ 2013-11-14 16:50 UTC (permalink / raw)
  To: Trond.Myklebust; +Cc: linux-nfs

nfs_file_direct_write only updates the inode size if it succeeded and
returned the number of bytes written.  But in the AIO case nfs_direct_wait
turns the return value into -EIOCBQUEUED and we skip the size update.

Instead the aio completion path should update it, which this patch
does.  The implementation is a little hacky because there is no obvious
way to find out we are called for a write in nfs_direct_complete.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/nfs/direct.c |   21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index d71d66c..653c2e8 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -222,12 +222,23 @@ out:
  * Synchronous I/O uses a stack-allocated iocb.  Thus we can't trust
  * the iocb is still valid here if this is a synchronous request.
  */
-static void nfs_direct_complete(struct nfs_direct_req *dreq)
+static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write)
 {
+	struct inode *inode = dreq->inode;
+
 	if (dreq->iocb) {
+		loff_t pos = dreq->iocb->ki_pos + dreq->count;
 		long res = (long) dreq->error;
 		if (!res)
 			res = (long) dreq->count;
+
+		if (write) {
+			spin_lock(&inode->i_lock);
+			if (i_size_read(inode) < pos)
+				i_size_write(inode, pos);
+			spin_unlock(&inode->i_lock);
+		}
+
 		aio_complete(dreq->iocb, res, 0);
 	}
 	complete_all(&dreq->completion);
@@ -272,7 +283,7 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
 	}
 out_put:
 	if (put_dreq(dreq))
-		nfs_direct_complete(dreq);
+		nfs_direct_complete(dreq, false);
 	hdr->release(hdr);
 }
 
@@ -434,7 +445,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
 	}
 
 	if (put_dreq(dreq))
-		nfs_direct_complete(dreq);
+		nfs_direct_complete(dreq, false);
 	return 0;
 }
 
@@ -594,7 +605,7 @@ static void nfs_direct_write_schedule_work(struct work_struct *work)
 			break;
 		default:
 			nfs_inode_dio_write_done(dreq->inode);
-			nfs_direct_complete(dreq);
+			nfs_direct_complete(dreq, true);
 	}
 }
 
@@ -611,7 +622,7 @@ static void nfs_direct_write_schedule_work(struct work_struct *work)
 static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
 {
 	nfs_inode_dio_write_done(inode);
-	nfs_direct_complete(dreq);
+	nfs_direct_complete(dreq, true);
 }
 #endif
 
-- 
1.7.10.4



^ permalink raw reply related	[flat|nested] 37+ messages in thread

* [PATCH 2/7] nfs: defer inode_dio_done call until size update is done
  2013-11-14 16:50 [PATCH 0/7] direct I/O fixes Christoph Hellwig
  2013-11-14 16:50 ` [PATCH 1/7] nfs: fix size updates for aio writes Christoph Hellwig
@ 2013-11-14 16:50 ` Christoph Hellwig
  2013-11-14 16:50 ` [PATCH 3/7] nfs: increment i_dio_count for reads, too Christoph Hellwig
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 37+ messages in thread
From: Christoph Hellwig @ 2013-11-14 16:50 UTC (permalink / raw)
  To: Trond.Myklebust; +Cc: linux-nfs

We need to have the I/O fully finished before telling the truncate code
that we are done.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/nfs/direct.c |   32 +++++++++++++++-----------------
 1 file changed, 15 insertions(+), 17 deletions(-)

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 653c2e8..3dd8823 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -226,21 +226,27 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write)
 {
 	struct inode *inode = dreq->inode;
 
-	if (dreq->iocb) {
+	if (dreq->iocb && write) {
 		loff_t pos = dreq->iocb->ki_pos + dreq->count;
+
+		spin_lock(&inode->i_lock);
+		if (i_size_read(inode) < pos)
+			i_size_write(inode, pos);
+		spin_unlock(&inode->i_lock);
+	}
+
+	if (write) {
+		nfs_zap_mapping(inode, inode->i_mapping);
+		inode_dio_done(inode);
+	}
+
+	if (dreq->iocb) {
 		long res = (long) dreq->error;
 		if (!res)
 			res = (long) dreq->count;
-
-		if (write) {
-			spin_lock(&inode->i_lock);
-			if (i_size_read(inode) < pos)
-				i_size_write(inode, pos);
-			spin_unlock(&inode->i_lock);
-		}
-
 		aio_complete(dreq->iocb, res, 0);
 	}
+
 	complete_all(&dreq->completion);
 
 	nfs_direct_req_release(dreq);
@@ -483,12 +489,6 @@ out:
 	return result;
 }
 
-static void nfs_inode_dio_write_done(struct inode *inode)
-{
-	nfs_zap_mapping(inode, inode->i_mapping);
-	inode_dio_done(inode);
-}
-
 #if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
 static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
 {
@@ -604,7 +604,6 @@ static void nfs_direct_write_schedule_work(struct work_struct *work)
 			nfs_direct_write_reschedule(dreq);
 			break;
 		default:
-			nfs_inode_dio_write_done(dreq->inode);
 			nfs_direct_complete(dreq, true);
 	}
 }
@@ -621,7 +620,6 @@ static void nfs_direct_write_schedule_work(struct work_struct *work)
 
 static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
 {
-	nfs_inode_dio_write_done(inode);
 	nfs_direct_complete(dreq, true);
 }
 #endif
-- 
1.7.10.4



^ permalink raw reply related	[flat|nested] 37+ messages in thread

* [PATCH 3/7] nfs: increment i_dio_count for reads, too
  2013-11-14 16:50 [PATCH 0/7] direct I/O fixes Christoph Hellwig
  2013-11-14 16:50 ` [PATCH 1/7] nfs: fix size updates for aio writes Christoph Hellwig
  2013-11-14 16:50 ` [PATCH 2/7] nfs: defer inode_dio_done call until size update is done Christoph Hellwig
@ 2013-11-14 16:50 ` Christoph Hellwig
  2013-11-14 16:50 ` [PATCH 4/7] nfs: merge nfs_direct_read into nfs_file_direct_read Christoph Hellwig
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 37+ messages in thread
From: Christoph Hellwig @ 2013-11-14 16:50 UTC (permalink / raw)
  To: Trond.Myklebust; +Cc: linux-nfs

i_dio_count is used to protect dio access against truncate.  We want
to make sure there are no dio reads pending either when doing a
truncate.  I suspect on plain NFS things might work even without
this, but once we use a pnfs layout driver that access backing devices
directly things will go bad without the proper synchronization.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/nfs/direct.c |    9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 3dd8823..62ea415 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -235,10 +235,10 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write)
 		spin_unlock(&inode->i_lock);
 	}
 
-	if (write) {
+	if (write)
 		nfs_zap_mapping(inode, inode->i_mapping);
-		inode_dio_done(inode);
-	}
+
+	inode_dio_done(inode);
 
 	if (dreq->iocb) {
 		long res = (long) dreq->error;
@@ -419,6 +419,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
 					      loff_t pos, bool uio)
 {
 	struct nfs_pageio_descriptor desc;
+	struct inode *inode = dreq->inode;
 	ssize_t result = -EINVAL;
 	size_t requested_bytes = 0;
 	unsigned long seg;
@@ -427,6 +428,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
 			     &nfs_direct_read_completion_ops);
 	get_dreq(dreq);
 	desc.pg_dreq = dreq;
+	atomic_inc(&inode->i_dio_count);
 
 	for (seg = 0; seg < nr_segs; seg++) {
 		const struct iovec *vec = &iov[seg];
@@ -446,6 +448,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
 	 * generic layer handle the completion.
 	 */
 	if (requested_bytes == 0) {
+		inode_dio_done(inode);
 		nfs_direct_req_release(dreq);
 		return result < 0 ? result : -EIO;
 	}
-- 
1.7.10.4



^ permalink raw reply related	[flat|nested] 37+ messages in thread

* [PATCH 4/7] nfs: merge nfs_direct_read into nfs_file_direct_read
  2013-11-14 16:50 [PATCH 0/7] direct I/O fixes Christoph Hellwig
                   ` (2 preceding siblings ...)
  2013-11-14 16:50 ` [PATCH 3/7] nfs: increment i_dio_count for reads, too Christoph Hellwig
@ 2013-11-14 16:50 ` Christoph Hellwig
  2013-11-14 16:50 ` [PATCH 5/7] nfs: merge nfs_direct_write into nfs_file_direct_write Christoph Hellwig
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 37+ messages in thread
From: Christoph Hellwig @ 2013-11-14 16:50 UTC (permalink / raw)
  To: Trond.Myklebust; +Cc: linux-nfs

Simple code cleanup to prepare for later fixes.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/nfs/direct.c |  107 +++++++++++++++++++++++++------------------------------
 1 file changed, 49 insertions(+), 58 deletions(-)

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 62ea415..8312796 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -458,14 +458,55 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
 	return 0;
 }
 
-static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
-			       unsigned long nr_segs, loff_t pos, bool uio)
+/**
+ * nfs_file_direct_read - file direct read operation for NFS files
+ * @iocb: target I/O control block
+ * @iov: vector of user buffers into which to read data
+ * @nr_segs: size of iov vector
+ * @pos: byte offset in file where reading starts
+ *
+ * We use this function for direct reads instead of calling
+ * generic_file_aio_read() in order to avoid gfar's check to see if
+ * the request starts before the end of the file.  For that check
+ * to work, we must generate a GETATTR before each direct read, and
+ * even then there is a window between the GETATTR and the subsequent
+ * READ where the file size could change.  Our preference is simply
+ * to do all reads the application wants, and the server will take
+ * care of managing the end of file boundary.
+ *
+ * This function also eliminates unnecessarily updating the file's
+ * atime locally, as the NFS server sets the file's atime, and this
+ * client must read the updated atime from the server back into its
+ * cache.
+ */
+ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
+				unsigned long nr_segs, loff_t pos, bool uio)
 {
-	ssize_t result = -ENOMEM;
-	struct inode *inode = iocb->ki_filp->f_mapping->host;
+	struct file *file = iocb->ki_filp;
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
 	struct nfs_direct_req *dreq;
 	struct nfs_lock_context *l_ctx;
+	ssize_t result = -EINVAL;
+	size_t count;
+
+	count = iov_length(iov, nr_segs);
+	nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count);
+
+	dfprintk(FILE, "NFS: direct read(%pD2, %zd@%Ld)\n",
+		file, count, (long long) pos);
+
+	result = 0;
+	if (!count)
+		goto out;
+
+	result = nfs_sync_mapping(mapping);
+	if (result)
+		goto out;
+
+	task_io_account_read(count);
 
+	result = -ENOMEM;
 	dreq = nfs_direct_req_alloc();
 	if (dreq == NULL)
 		goto out;
@@ -484,8 +525,11 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
 
 	NFS_I(inode)->read_io += iov_length(iov, nr_segs);
 	result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos, uio);
-	if (!result)
+	if (!result) {
 		result = nfs_direct_wait(dreq);
+		if (result > 0)
+			iocb->ki_pos = pos + result;
+	}
 out_release:
 	nfs_direct_req_release(dreq);
 out:
@@ -889,59 +933,6 @@ out:
 }
 
 /**
- * nfs_file_direct_read - file direct read operation for NFS files
- * @iocb: target I/O control block
- * @iov: vector of user buffers into which to read data
- * @nr_segs: size of iov vector
- * @pos: byte offset in file where reading starts
- *
- * We use this function for direct reads instead of calling
- * generic_file_aio_read() in order to avoid gfar's check to see if
- * the request starts before the end of the file.  For that check
- * to work, we must generate a GETATTR before each direct read, and
- * even then there is a window between the GETATTR and the subsequent
- * READ where the file size could change.  Our preference is simply
- * to do all reads the application wants, and the server will take
- * care of managing the end of file boundary.
- *
- * This function also eliminates unnecessarily updating the file's
- * atime locally, as the NFS server sets the file's atime, and this
- * client must read the updated atime from the server back into its
- * cache.
- */
-ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
-				unsigned long nr_segs, loff_t pos, bool uio)
-{
-	ssize_t retval = -EINVAL;
-	struct file *file = iocb->ki_filp;
-	struct address_space *mapping = file->f_mapping;
-	size_t count;
-
-	count = iov_length(iov, nr_segs);
-	nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count);
-
-	dfprintk(FILE, "NFS: direct read(%pD2, %zd@%Ld)\n",
-		file, count, (long long) pos);
-
-	retval = 0;
-	if (!count)
-		goto out;
-
-	retval = nfs_sync_mapping(mapping);
-	if (retval)
-		goto out;
-
-	task_io_account_read(count);
-
-	retval = nfs_direct_read(iocb, iov, nr_segs, pos, uio);
-	if (retval > 0)
-		iocb->ki_pos = pos + retval;
-
-out:
-	return retval;
-}
-
-/**
  * nfs_file_direct_write - file direct write operation for NFS files
  * @iocb: target I/O control block
  * @iov: vector of user buffers from which to write data
-- 
1.7.10.4



^ permalink raw reply related	[flat|nested] 37+ messages in thread

* [PATCH 5/7] nfs: merge nfs_direct_write into nfs_file_direct_write
  2013-11-14 16:50 [PATCH 0/7] direct I/O fixes Christoph Hellwig
                   ` (3 preceding siblings ...)
  2013-11-14 16:50 ` [PATCH 4/7] nfs: merge nfs_direct_read into nfs_file_direct_read Christoph Hellwig
@ 2013-11-14 16:50 ` Christoph Hellwig
  2013-11-14 16:50 ` [PATCH 6/7] nfs: take i_mutex during direct I/O reads Christoph Hellwig
  2013-11-14 16:50 ` [PATCH 7/7] nfs: page cache invalidation for dio Christoph Hellwig
  6 siblings, 0 replies; 37+ messages in thread
From: Christoph Hellwig @ 2013-11-14 16:50 UTC (permalink / raw)
  To: Trond.Myklebust; +Cc: linux-nfs

Simple code cleanup to prepare for later fixes.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/nfs/direct.c |   91 +++++++++++++++++++++++++------------------------------
 1 file changed, 41 insertions(+), 50 deletions(-)

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 8312796..c32db2a 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -898,40 +898,6 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
 	return 0;
 }
 
-static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
-				unsigned long nr_segs, loff_t pos,
-				size_t count, bool uio)
-{
-	ssize_t result = -ENOMEM;
-	struct inode *inode = iocb->ki_filp->f_mapping->host;
-	struct nfs_direct_req *dreq;
-	struct nfs_lock_context *l_ctx;
-
-	dreq = nfs_direct_req_alloc();
-	if (!dreq)
-		goto out;
-
-	dreq->inode = inode;
-	dreq->bytes_left = count;
-	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
-	l_ctx = nfs_get_lock_context(dreq->ctx);
-	if (IS_ERR(l_ctx)) {
-		result = PTR_ERR(l_ctx);
-		goto out_release;
-	}
-	dreq->l_ctx = l_ctx;
-	if (!is_sync_kiocb(iocb))
-		dreq->iocb = iocb;
-
-	result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, uio);
-	if (!result)
-		result = nfs_direct_wait(dreq);
-out_release:
-	nfs_direct_req_release(dreq);
-out:
-	return result;
-}
-
 /**
  * nfs_file_direct_write - file direct write operation for NFS files
  * @iocb: target I/O control block
@@ -957,9 +923,12 @@ out:
 ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 				unsigned long nr_segs, loff_t pos, bool uio)
 {
-	ssize_t retval = -EINVAL;
+	ssize_t result = -EINVAL;
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
+	struct nfs_direct_req *dreq;
+	struct nfs_lock_context *l_ctx;
 	size_t count;
 
 	count = iov_length(iov, nr_segs);
@@ -968,35 +937,57 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 	dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n",
 		file, count, (long long) pos);
 
-	retval = generic_write_checks(file, &pos, &count, 0);
-	if (retval)
+	result = generic_write_checks(file, &pos, &count, 0);
+	if (result)
 		goto out;
 
-	retval = -EINVAL;
+	result = -EINVAL;
 	if ((ssize_t) count < 0)
 		goto out;
-	retval = 0;
+	result = 0;
 	if (!count)
 		goto out;
 
-	retval = nfs_sync_mapping(mapping);
-	if (retval)
+	result = nfs_sync_mapping(mapping);
+	if (result)
 		goto out;
 
 	task_io_account_write(count);
 
-	retval = nfs_direct_write(iocb, iov, nr_segs, pos, count, uio);
-	if (retval > 0) {
-		struct inode *inode = mapping->host;
+	result = -ENOMEM;
+	dreq = nfs_direct_req_alloc();
+	if (!dreq)
+		goto out;
 
-		iocb->ki_pos = pos + retval;
-		spin_lock(&inode->i_lock);
-		if (i_size_read(inode) < iocb->ki_pos)
-			i_size_write(inode, iocb->ki_pos);
-		spin_unlock(&inode->i_lock);
+	dreq->inode = inode;
+	dreq->bytes_left = count;
+	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
+	l_ctx = nfs_get_lock_context(dreq->ctx);
+	if (IS_ERR(l_ctx)) {
+		result = PTR_ERR(l_ctx);
+		goto out_release;
+	}
+	dreq->l_ctx = l_ctx;
+	if (!is_sync_kiocb(iocb))
+		dreq->iocb = iocb;
+
+	result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, uio);
+	if (!result) {
+		result = nfs_direct_wait(dreq);
+		if (result > 0) {
+			struct inode *inode = mapping->host;
+
+			iocb->ki_pos = pos + result;
+			spin_lock(&inode->i_lock);
+			if (i_size_read(inode) < iocb->ki_pos)
+				i_size_write(inode, iocb->ki_pos);
+			spin_unlock(&inode->i_lock);
+		}
 	}
+out_release:
+	nfs_direct_req_release(dreq);
 out:
-	return retval;
+	return result;
 }
 
 /**
-- 
1.7.10.4



^ permalink raw reply related	[flat|nested] 37+ messages in thread

* [PATCH 6/7] nfs: take i_mutex during direct I/O reads
  2013-11-14 16:50 [PATCH 0/7] direct I/O fixes Christoph Hellwig
                   ` (4 preceding siblings ...)
  2013-11-14 16:50 ` [PATCH 5/7] nfs: merge nfs_direct_write into nfs_file_direct_write Christoph Hellwig
@ 2013-11-14 16:50 ` Christoph Hellwig
  2013-11-14 17:00   ` Chuck Lever
  2013-11-14 20:43   ` Trond Myklebust
  2013-11-14 16:50 ` [PATCH 7/7] nfs: page cache invalidation for dio Christoph Hellwig
  6 siblings, 2 replies; 37+ messages in thread
From: Christoph Hellwig @ 2013-11-14 16:50 UTC (permalink / raw)
  To: Trond.Myklebust; +Cc: linux-nfs

We'll need the i_mutex to prevent i_dio_count from incrementing while
truncate is waiting for it to reach zero, and to protect against having
the pagecache repopulated after we flushed it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/nfs/direct.c |   14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index c32db2a..6cc7fe1 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -500,16 +500,17 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
 	if (!count)
 		goto out;
 
+	mutex_lock(&inode->i_mutex);
 	result = nfs_sync_mapping(mapping);
 	if (result)
-		goto out;
+		goto out_unlock;
 
 	task_io_account_read(count);
 
 	result = -ENOMEM;
 	dreq = nfs_direct_req_alloc();
 	if (dreq == NULL)
-		goto out;
+		goto out_unlock;
 
 	dreq->inode = inode;
 	dreq->bytes_left = iov_length(iov, nr_segs);
@@ -525,13 +526,22 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
 
 	NFS_I(inode)->read_io += iov_length(iov, nr_segs);
 	result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos, uio);
+
+	mutex_unlock(&inode->i_mutex);
+
 	if (!result) {
 		result = nfs_direct_wait(dreq);
 		if (result > 0)
 			iocb->ki_pos = pos + result;
 	}
+
+	nfs_direct_req_release(dreq);
+	return result;
+
 out_release:
 	nfs_direct_req_release(dreq);
+out_unlock:
+	mutex_unlock(&inode->i_mutex);
 out:
 	return result;
 }
-- 
1.7.10.4



^ permalink raw reply related	[flat|nested] 37+ messages in thread

* [PATCH 7/7] nfs: page cache invalidation for dio
  2013-11-14 16:50 [PATCH 0/7] direct I/O fixes Christoph Hellwig
                   ` (5 preceding siblings ...)
  2013-11-14 16:50 ` [PATCH 6/7] nfs: take i_mutex during direct I/O reads Christoph Hellwig
@ 2013-11-14 16:50 ` Christoph Hellwig
  2013-11-14 18:35   ` Jeff Layton
  6 siblings, 1 reply; 37+ messages in thread
From: Christoph Hellwig @ 2013-11-14 16:50 UTC (permalink / raw)
  To: Trond.Myklebust; +Cc: linux-nfs

Make sure to properly invalidate the pagecache before performing direct I/O,
so that no stale pages are left around.  This matches what the generic
direct I/O code does.  Also take the i_mutex over the direct write submission
to avoid the livelock vs truncate waiting for i_dio_count to decrease, and
to avoid having the pagecache easily repopulated while direct I/O is in
progress.  Again matching the generic direct I/O code.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/nfs/direct.c |   29 +++++++++++++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 6cc7fe1..2b778fc 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -939,9 +939,12 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 	struct inode *inode = mapping->host;
 	struct nfs_direct_req *dreq;
 	struct nfs_lock_context *l_ctx;
+	loff_t end;
 	size_t count;
 
 	count = iov_length(iov, nr_segs);
+	end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
+
 	nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count);
 
 	dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n",
@@ -958,16 +961,25 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 	if (!count)
 		goto out;
 
+	mutex_lock(&inode->i_mutex);
+
 	result = nfs_sync_mapping(mapping);
 	if (result)
-		goto out;
+		goto out_unlock;
+
+	if (mapping->nrpages) {
+		result = invalidate_inode_pages2_range(mapping,
+					pos >> PAGE_CACHE_SHIFT, end);
+		if (result)
+			goto out_unlock;
+	}
 
 	task_io_account_write(count);
 
 	result = -ENOMEM;
 	dreq = nfs_direct_req_alloc();
 	if (!dreq)
-		goto out;
+		goto out_unlock;
 
 	dreq->inode = inode;
 	dreq->bytes_left = count;
@@ -982,6 +994,14 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 		dreq->iocb = iocb;
 
 	result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, uio);
+	
+	if (mapping->nrpages) {
+		invalidate_inode_pages2_range(mapping,
+					      pos >> PAGE_CACHE_SHIFT, end);
+	}
+
+	mutex_unlock(&inode->i_mutex);
+
 	if (!result) {
 		result = nfs_direct_wait(dreq);
 		if (result > 0) {
@@ -994,8 +1014,13 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 			spin_unlock(&inode->i_lock);
 		}
 	}
+	nfs_direct_req_release(dreq);
+	return result;
+
 out_release:
 	nfs_direct_req_release(dreq);
+out_unlock:
+	mutex_unlock(&inode->i_mutex);
 out:
 	return result;
 }
-- 
1.7.10.4



^ permalink raw reply related	[flat|nested] 37+ messages in thread

* Re: [PATCH 6/7] nfs: take i_mutex during direct I/O reads
  2013-11-14 16:50 ` [PATCH 6/7] nfs: take i_mutex during direct I/O reads Christoph Hellwig
@ 2013-11-14 17:00   ` Chuck Lever
  2013-11-15 14:29     ` Christoph Hellwig
  2013-11-14 20:43   ` Trond Myklebust
  1 sibling, 1 reply; 37+ messages in thread
From: Chuck Lever @ 2013-11-14 17:00 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Trond.Myklebust, linux-nfs

Hi Christoph-

On Nov 14, 2013, at 11:50 AM, Christoph Hellwig <hch@infradead.org> wrote:

> We'll need the i_mutex to prevent i_dio_count from incrementing while
> truncate is waiting for it to reach zero, and to protect against having
> the pagecache repopulated after we flushed it.

How was the performance impact of this change measured?


> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> fs/nfs/direct.c |   14 ++++++++++++--
> 1 file changed, 12 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
> index c32db2a..6cc7fe1 100644
> --- a/fs/nfs/direct.c
> +++ b/fs/nfs/direct.c
> @@ -500,16 +500,17 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
> 	if (!count)
> 		goto out;
> 
> +	mutex_lock(&inode->i_mutex);
> 	result = nfs_sync_mapping(mapping);
> 	if (result)
> -		goto out;
> +		goto out_unlock;
> 
> 	task_io_account_read(count);
> 
> 	result = -ENOMEM;
> 	dreq = nfs_direct_req_alloc();
> 	if (dreq == NULL)
> -		goto out;
> +		goto out_unlock;
> 
> 	dreq->inode = inode;
> 	dreq->bytes_left = iov_length(iov, nr_segs);
> @@ -525,13 +526,22 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
> 
> 	NFS_I(inode)->read_io += iov_length(iov, nr_segs);
> 	result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos, uio);
> +
> +	mutex_unlock(&inode->i_mutex);
> +
> 	if (!result) {
> 		result = nfs_direct_wait(dreq);
> 		if (result > 0)
> 			iocb->ki_pos = pos + result;
> 	}
> +
> +	nfs_direct_req_release(dreq);
> +	return result;
> +
> out_release:
> 	nfs_direct_req_release(dreq);
> +out_unlock:
> +	mutex_unlock(&inode->i_mutex);
> out:
> 	return result;
> }
> -- 
> 1.7.10.4
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

-- 
Chuck Lever
chuck[dot]lever[at]oracle[dot]com





^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 7/7] nfs: page cache invalidation for dio
  2013-11-14 16:50 ` [PATCH 7/7] nfs: page cache invalidation for dio Christoph Hellwig
@ 2013-11-14 18:35   ` Jeff Layton
  2013-11-15 14:28     ` Christoph Hellwig
  0 siblings, 1 reply; 37+ messages in thread
From: Jeff Layton @ 2013-11-14 18:35 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Trond.Myklebust, linux-nfs

On Thu, 14 Nov 2013 08:50:34 -0800
Christoph Hellwig <hch@infradead.org> wrote:

> Make sure to properly invalidate the pagecache before performing direct I/O,
> so that no stale pages are left around.  This matches what the generic
> direct I/O code does.  Also take the i_mutex over the direct write submission
> to avoid the livelock vs truncate waiting for i_dio_count to decrease, and
> to avoid having the pagecache easily repopulated while direct I/O is in
> progress.  Again matching the generic direct I/O code.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/nfs/direct.c |   29 +++++++++++++++++++++++++++--
>  1 file changed, 27 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
> index 6cc7fe1..2b778fc 100644
> --- a/fs/nfs/direct.c
> +++ b/fs/nfs/direct.c
> @@ -939,9 +939,12 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
>  	struct inode *inode = mapping->host;
>  	struct nfs_direct_req *dreq;
>  	struct nfs_lock_context *l_ctx;
> +	loff_t end;
>  	size_t count;
>  
>  	count = iov_length(iov, nr_segs);
> +	end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
> +
>  	nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count);
>  
>  	dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n",
> @@ -958,16 +961,25 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
>  	if (!count)
>  		goto out;
>  
> +	mutex_lock(&inode->i_mutex);
> +
>  	result = nfs_sync_mapping(mapping);
>  	if (result)
> -		goto out;
> +		goto out_unlock;
> +
> +	if (mapping->nrpages) {
> +		result = invalidate_inode_pages2_range(mapping,
> +					pos >> PAGE_CACHE_SHIFT, end);
> +		if (result)
> +			goto out_unlock;
> +	}
>  
>  	task_io_account_write(count);
>  
>  	result = -ENOMEM;
>  	dreq = nfs_direct_req_alloc();
>  	if (!dreq)
> -		goto out;
> +		goto out_unlock;
>  
>  	dreq->inode = inode;
>  	dreq->bytes_left = count;
> @@ -982,6 +994,14 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
>  		dreq->iocb = iocb;
>  
>  	result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, uio);
> +	
> +	if (mapping->nrpages) {
> +		invalidate_inode_pages2_range(mapping,
> +					      pos >> PAGE_CACHE_SHIFT, end);
> +	}
> +
> +	mutex_unlock(&inode->i_mutex);
> +
>  	if (!result) {
>  		result = nfs_direct_wait(dreq);
>  		if (result > 0) {
> @@ -994,8 +1014,13 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
>  			spin_unlock(&inode->i_lock);
>  		}
>  	}
> +	nfs_direct_req_release(dreq);
> +	return result;
> +
>  out_release:
>  	nfs_direct_req_release(dreq);
> +out_unlock:
> +	mutex_unlock(&inode->i_mutex);
>  out:
>  	return result;
>  }

Hrm... I started chasing down a bug reported by our QA group last week
that's showing up when you mix DIO writes and buffered reads
(basically, diotest3 in the LTP suite is failing). The bug is marked
private for dumb reasons but I'll see if I can make it public. I'll
also plan to give this series a spin to see if it helps fix that bug...

In any case, the DIO write code calls nfs_zap_mapping after it gets the
WRITE reply. That sets NFS_INO_INVALID_DATA and should prevent buffered
read() calls from getting data out of the cache after the write reply
comes in.

Why is that not sufficient here?

-- 
Jeff Layton <jlayton@redhat.com>

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 6/7] nfs: take i_mutex during direct I/O reads
  2013-11-14 16:50 ` [PATCH 6/7] nfs: take i_mutex during direct I/O reads Christoph Hellwig
  2013-11-14 17:00   ` Chuck Lever
@ 2013-11-14 20:43   ` Trond Myklebust
  2013-11-15 14:32     ` Christoph Hellwig
  1 sibling, 1 reply; 37+ messages in thread
From: Trond Myklebust @ 2013-11-14 20:43 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Linux NFS Mailing List


On Nov 14, 2013, at 11:50, Christoph Hellwig <hch@infradead.org> wrote:

> We'll need the i_mutex to prevent i_dio_count from incrementing while
> truncate is waiting for it to reach zero, and to protect against having
> the pagecache repopulated after we flushed it.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Hi Christoph,

Why do we need to protect the page cache here? Using O_DIRECT and the page cache without some kind of application-specific synchronization method is technically not supported, since that violates close-to-open cache consistency.
What we _do_ want to support, though, is parallel reads and writes to the server by applications that know what they are doing. If we were to only protect the i_dio_count, then we could fix the truncate race, while continuing to allow parallelism. Is there any reason why we cannot do this?

Cheers
  Trond

PS: I appear to be missing 7/7 in this patch series. Should I have seen it?

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 7/7] nfs: page cache invalidation for dio
  2013-11-14 18:35   ` Jeff Layton
@ 2013-11-15 14:28     ` Christoph Hellwig
  2013-11-15 14:52       ` Jeff Layton
  0 siblings, 1 reply; 37+ messages in thread
From: Christoph Hellwig @ 2013-11-15 14:28 UTC (permalink / raw)
  To: Jeff Layton; +Cc: Christoph Hellwig, Trond.Myklebust, linux-nfs

On Thu, Nov 14, 2013 at 01:35:51PM -0500, Jeff Layton wrote:
> Hrm... I started chasing down a bug reported by our QA group last week
> that's showing up when you mix DIO writes and buffered reads
> (basically, diotest3 in the LTP suite is failing). The bug is marked
> private for dumb reasons but I'll see if I can make it public. I'll
> also plan to give this series a spin to see if it helps fix that bug...
> 
> In any case, the DIO write code calls nfs_zap_mapping after it gets the
> WRITE reply. That sets NFS_INO_INVALID_DATA and should prevent buffered
> read() calls from getting data out of the cache after the write reply
> comes in.
> 
> Why is that not sufficient here?

Sounds like it should actually be fine, although I had similar testcases
fail.  I didn't even notice we were doing the invalidation, but delaying
it.  Can't see how that helps when bringing mmap into the game, although
that was always a best-effort and pray that it works scenario.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 6/7] nfs: take i_mutex during direct I/O reads
  2013-11-14 17:00   ` Chuck Lever
@ 2013-11-15 14:29     ` Christoph Hellwig
  0 siblings, 0 replies; 37+ messages in thread
From: Christoph Hellwig @ 2013-11-15 14:29 UTC (permalink / raw)
  To: Chuck Lever; +Cc: Christoph Hellwig, Trond.Myklebust, linux-nfs

On Thu, Nov 14, 2013 at 12:00:06PM -0500, Chuck Lever wrote:
> > We'll need the i_mutex to prevent i_dio_count from incrementing while
> > truncate is waiting for it to reach zero, and protects against having
> > the pagecache repopulated after we flushed it.
> 
> How was the performance impact of this change measured?

I've only done correctness tests.


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 6/7] nfs: take i_mutex during direct I/O reads
  2013-11-14 20:43   ` Trond Myklebust
@ 2013-11-15 14:32     ` Christoph Hellwig
  2013-11-15 15:23       ` Trond Myklebust
  0 siblings, 1 reply; 37+ messages in thread
From: Christoph Hellwig @ 2013-11-15 14:32 UTC (permalink / raw)
  To: Trond Myklebust; +Cc: Christoph Hellwig, Linux NFS Mailing List

On Thu, Nov 14, 2013 at 03:43:21PM -0500, Trond Myklebust wrote:
> 
> Hi Christoph,
> 
> Why do we need to protect the page cache here? Using O_DIRECT and the page cache without some kind of application-specific synchronization method is technically not supported, since that violates close-to-open cache consistency.

As far as I understand close to open matter for synchronizations with
access from different clients.  Linux applications do expect tight
synchronization locally.

> What we _do_ want to support, though, is parallel reads and writes to the server by applications that know what they are doing. If we were to only protect the i_dio_count, then we could fix the truncate race, while continuing to allow parallelism. Is there any reason why we cannot do this?

To just fix that race we could do it.  Note that even with the patch we
only do the setup and I/O submission under the lock, and wait for it
outside, similar to what the local filesystems do for aio, but not for
synchronous direct I/O.

Another good way to speed this up is to use locking scheme XFS uses with
a shared exclusive lock:

buffered write:			exclusive
buffered read:			shared
direct I/O			exclusive if pagecache is present, then
				demote to shared for actual I/O,
				shared only if no pages are present on
				the file


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 7/7] nfs: page cache invalidation for dio
  2013-11-15 14:28     ` Christoph Hellwig
@ 2013-11-15 14:52       ` Jeff Layton
  2013-11-15 15:02         ` Christoph Hellwig
  0 siblings, 1 reply; 37+ messages in thread
From: Jeff Layton @ 2013-11-15 14:52 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Trond.Myklebust, linux-nfs

On Fri, 15 Nov 2013 06:28:47 -0800
Christoph Hellwig <hch@infradead.org> wrote:

> On Thu, Nov 14, 2013 at 01:35:51PM -0500, Jeff Layton wrote:
> > Hrm... I started chasing down a bug reported by our QA group last week
> > that's showing up when you mix DIO writes and buffered reads
> > (basically, diotest3 in the LTP suite is failing). The bug is marked
> > private for dumb reasons but I'll see if I can make it public. I'll
> > also plan to give this series a spin to see if it helps fix that bug...
> > 
> > In any case, the DIO write code calls nfs_zap_mapping after it gets the
> > WRITE reply. That sets NFS_INO_INVALID_DATA and should prevent buffered
> > read() calls from getting data out of the cache after the write reply
> > comes in.
> > 
> > Why is that not sufficient here?
> 
> Sounds like it should actually be fine, although I had similar testcases
> fail.  I didn't even notice we were doing the invalidation, but delaying
> it.  Can't see how that helps when bringing mmap into the game, although
> that was always a best-effort and pray that it works scenario.
>

Ok, cool. The bug that I've been looking at with Trond's help is here:

    https://bugzilla.redhat.com/show_bug.cgi?id=919382

Do you have these patches in a git tree someplace? If so, I wouldn't
mind running this reproducer against it to see if it helps. It's a bit
of a longshot, but what the heck...
-- 
Jeff Layton <jlayton@redhat.com>

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 7/7] nfs: page cache invalidation for dio
  2013-11-15 14:52       ` Jeff Layton
@ 2013-11-15 15:02         ` Christoph Hellwig
  2013-11-15 15:33           ` Jeff Layton
  0 siblings, 1 reply; 37+ messages in thread
From: Christoph Hellwig @ 2013-11-15 15:02 UTC (permalink / raw)
  To: Jeff Layton; +Cc: Christoph Hellwig, Trond.Myklebust, linux-nfs

On Fri, Nov 15, 2013 at 09:52:41AM -0500, Jeff Layton wrote:
> Do you have these patches in a git tree someplace? If so, I wouldn't
> mind running this reproducer against it to see if it helps. It's a bit
> of a longshot, but what the heck...

While I do have a local git tree I don't really have anywhere to push
it to.  But applying these patches shouldn't be all that hard.


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 6/7] nfs: take i_mutex during direct I/O reads
  2013-11-15 14:32     ` Christoph Hellwig
@ 2013-11-15 15:23       ` Trond Myklebust
  2013-11-15 15:25         ` Christoph Hellwig
  0 siblings, 1 reply; 37+ messages in thread
From: Trond Myklebust @ 2013-11-15 15:23 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Linux NFS Mailing List


On Nov 15, 2013, at 9:32, Christoph Hellwig <hch@infradead.org> wrote:

> On Thu, Nov 14, 2013 at 03:43:21PM -0500, Trond Myklebust wrote:
>> 
>> Hi Christoph,
>> 
>> Why do we need to protect the page cache here? Using O_DIRECT and the page cache without some kind of application-specific synchronization method is technically not supported, since that violates close-to-open cache consistency.
> 
> As far as I understand close to open matter for synchronizations with
> access from different clients.  Linux applications do expect tight
> synchronization locally.

Yes, but it seems to me that if you’re adding locking in order to synchronize O_DIRECT with the page cache, then something is very wrong. It is hard to close the races, and you are really mixing 2 models that shouldn’t coexist.

It seems a lot more correct in that case to have the kernel convert those O_DIRECT writes into O_SYNC if we really need page cache consistency. That’s the only way we can really _guarantee_ that consistency.

>> What we _do_ want to support, though, is parallel reads and writes to the server by applications that know what they are doing. If we were to only protect the i_dio_count, then we could fix the truncate race, while continuing to allow parallelism. Is there any reason why we cannot do this?
> 
> To just fix that race we could do it.  Note that even with the patch we
> only do the setup and I/O submission under the lock, and wait for it
> outside, similar to what the local filesystems do for aio, but not for
> synchronous direct I/O.
> 
> Another good way to speed this up is to use locking scheme XFS uses with
> a shared exclusive lock:
> 
> buffered write:			exclusive
> buffered read:			shared
> direct I/O			exclusive if pagecache is present, then
> 				demote to shared for actual I/O,
> 				shared only if no pages are present on
> 				the file

I think we should try to close the truncate race first. Then let’s look at whether or not people need better consistency with the page cache.

Cheers,
  Trond

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 6/7] nfs: take i_mutex during direct I/O reads
  2013-11-15 15:23       ` Trond Myklebust
@ 2013-11-15 15:25         ` Christoph Hellwig
  2013-11-15 15:34           ` Trond Myklebust
  2013-11-15 16:00           ` Trond Myklebust
  0 siblings, 2 replies; 37+ messages in thread
From: Christoph Hellwig @ 2013-11-15 15:25 UTC (permalink / raw)
  To: Trond Myklebust; +Cc: Christoph Hellwig, Linux NFS Mailing List

On Fri, Nov 15, 2013 at 10:23:01AM -0500, Trond Myklebust wrote:
> > buffered read:			shared
> > direct I/O			exclusive if pagecache is present, then
> > 				demote to shared for actual I/O,
> > 				shared only if no pages are present on
> > 				the file
> 
> I think we should try to close the truncate race first. Then let's look at whether or not people need better consistency with the page cache.

With the current locking model that will require the i_mutex in both
read and write around the i_dio_count increment.  The XFS model avoids
taking an exclusive lock for the direct I/O fast path.


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 7/7] nfs: page cache invalidation for dio
  2013-11-15 15:02         ` Christoph Hellwig
@ 2013-11-15 15:33           ` Jeff Layton
  2014-01-21 19:21             ` Jeff Layton
  0 siblings, 1 reply; 37+ messages in thread
From: Jeff Layton @ 2013-11-15 15:33 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Trond.Myklebust, linux-nfs

On Fri, 15 Nov 2013 07:02:04 -0800
Christoph Hellwig <hch@infradead.org> wrote:

> On Fri, Nov 15, 2013 at 09:52:41AM -0500, Jeff Layton wrote:
> > Do you have these patches in a git tree someplace? If so, I wouldn't
> > mind running this reproducer against it to see if it helps. It's a bit
> > of a longshot, but what the heck...
> 
> While I do have a local git tree I don't really have anywhere to push
> it to.  But applying these patches shouldn't be all that hard.
> 

It's not -- I'm just lazy...

FWIW, I tried this set and it didn't make any difference on the bug, so
I'll just keep soldiering on to track it down...

Thanks,
-- 
Jeff Layton <jlayton@redhat.com>

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 6/7] nfs: take i_mutex during direct I/O reads
  2013-11-15 15:25         ` Christoph Hellwig
@ 2013-11-15 15:34           ` Trond Myklebust
  2013-11-15 15:37             ` Christoph Hellwig
  2013-11-15 16:00           ` Trond Myklebust
  1 sibling, 1 reply; 37+ messages in thread
From: Trond Myklebust @ 2013-11-15 15:34 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Linux NFS Mailing List


On Nov 15, 2013, at 10:25, Christoph Hellwig <hch@infradead.org> wrote:

> On Fri, Nov 15, 2013 at 10:23:01AM -0500, Trond Myklebust wrote:
>>> buffered read:			shared
>>> direct I/O			exclusive if pagecache is present, then
>>> 				demote to shared for actual I/O,
>>> 				shared only if no pages are present on
>>> 				the file
>> 
>> I think we should try to close the truncate race first. Then let's look at whether or not people need better consistency with the page cache.
> 
> With the current locking model that will require the i_mutex in both
> read and write around the i_dio_count increment.  The XFS model avoids
> taking an exclusive lock for the direct I/O fast path.
> 

Is that the xfs_ilock vs xfs_rw_ilock code?


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 6/7] nfs: take i_mutex during direct I/O reads
  2013-11-15 15:34           ` Trond Myklebust
@ 2013-11-15 15:37             ` Christoph Hellwig
  0 siblings, 0 replies; 37+ messages in thread
From: Christoph Hellwig @ 2013-11-15 15:37 UTC (permalink / raw)
  To: Trond Myklebust; +Cc: Christoph Hellwig, Linux NFS Mailing List

On Fri, Nov 15, 2013 at 10:34:52AM -0500, Trond Myklebust wrote:
> > With the current locking model that will require the i_mutex in both
> > read and write around the i_dio_count increment.  The XFS model avoids
> > taking an exclusive lock for the direct I/O fast path.
> > 
> 
> Is that the xfs_ilock vs xfs_rw_ilock code?

xfs_rw_ilock is where you should lock, that are the helpers around
the xfs iolock which is the shared/exclusive lock and the i_mutex
which we still have to take in the exclusive case as the VFS expects it.


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 6/7] nfs: take i_mutex during direct I/O reads
  2013-11-15 15:25         ` Christoph Hellwig
  2013-11-15 15:34           ` Trond Myklebust
@ 2013-11-15 16:00           ` Trond Myklebust
  1 sibling, 0 replies; 37+ messages in thread
From: Trond Myklebust @ 2013-11-15 16:00 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Linux NFS Mailing List


On Nov 15, 2013, at 10:25, Christoph Hellwig <hch@infradead.org> wrote:

> On Fri, Nov 15, 2013 at 10:23:01AM -0500, Trond Myklebust wrote:
>>> buffered read:			shared
>>> direct I/O			exclusive if pagecache is present, then
>>> 				demote to shared for actual I/O,
>>> 				shared only if no pages are present on
>>> 				the file
>> 
>> I think we should try to close the truncate race first. Then let's look at whether or not people need better consistency with the page cache.
> 
> With the current locking model that will require the i_mutex in both
> read and write around the i_dio_count increment.  The XFS model avoids
> taking an exclusive lock for the direct I/O fast path.

OK, so afaics it is basically a wrapper around a 2 lock system: an rw_semaphore and the i_mutex. That means adding an rw_semaphore to the nfs_inode; we unfortunately cannot use the existing rw_sem since that is taken during lock recovery and so could deadlock if we take it during i/o.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 7/7] nfs: page cache invalidation for dio
  2013-11-15 15:33           ` Jeff Layton
@ 2014-01-21 19:21             ` Jeff Layton
  2014-01-22  8:24               ` Christoph Hellwig
  0 siblings, 1 reply; 37+ messages in thread
From: Jeff Layton @ 2014-01-21 19:21 UTC (permalink / raw)
  To: Jeff Layton; +Cc: Christoph Hellwig, Trond.Myklebust, linux-nfs

On Fri, 15 Nov 2013 10:33:24 -0500
Jeff Layton <jlayton@redhat.com> wrote:

> On Fri, 15 Nov 2013 07:02:04 -0800
> Christoph Hellwig <hch@infradead.org> wrote:
> 
> > On Fri, Nov 15, 2013 at 09:52:41AM -0500, Jeff Layton wrote:
> > > Do you have these patches in a git tree someplace? If so, I wouldn't
> > > mind running this reproducer against it to see if it helps. It's a bit
> > > of a longshot, but what the heck...
> > 
> > While I do have a local git tree I don't really have anywhere to push
> > it to.  But applying these patches shouldn't be all that hard.
> > 
> 
> It's not -- I'm just lazy...
> 
> FWIW, I tried this set and it didn't make any difference on the bug, so
> I'll just keep soldiering on to track it down...
> 

I just tried this set again, and it *did* seem to help that bug.

I think the reason it didn't before was that I had applied this set on
top of a tree that held a different patch that introduced a race in the
nfs_revalidate_mapping() code.

In any case, this helps but it's a little odd. With this patch, you add
an invalidate_inode_pages2 call prior to doing the DIO. But, you've
also left in the call to nfs_zap_mapping in the completion codepath.

So now, we shoot down the mapping prior to doing a DIO write, and then
mark the mapping for invalidation again when the write completes. Was
that intentional?

It seems a little excessive and might hurt performance in some cases.
OTOH, if you mix buffered and DIO you're asking for trouble anyway and
this approach seems to give better cache coherency.
-- 
Jeff Layton <jlayton@redhat.com>

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 7/7] nfs: page cache invalidation for dio
  2014-01-21 19:21             ` Jeff Layton
@ 2014-01-22  8:24               ` Christoph Hellwig
  2014-01-22 12:04                 ` Jeff Layton
  0 siblings, 1 reply; 37+ messages in thread
From: Christoph Hellwig @ 2014-01-22  8:24 UTC (permalink / raw)
  To: Jeff Layton; +Cc: Christoph Hellwig, Trond.Myklebust, linux-nfs

On Tue, Jan 21, 2014 at 02:21:59PM -0500, Jeff Layton wrote:
> In any case, this helps but it's a little odd. With this patch, you add
> an invalidate_inode_pages2 call prior to doing the DIO. But, you've
> also left in the call to nfs_zap_mapping in the completion codepath.
> 
> So now, we shoot down the mapping prior to doing a DIO write, and then
> mark the mapping for invalidation again when the write completes. Was
> that intentional?
> 
> It seems a little excessive and might hurt performance in some cases.
> OTOH, if you mix buffered and DIO you're asking for trouble anyway and
> this approach seems to give better cache coherency.

This follows the model implemented and documented in
generic_file_direct_write().


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 7/7] nfs: page cache invalidation for dio
  2014-01-22  8:24               ` Christoph Hellwig
@ 2014-01-22 12:04                 ` Jeff Layton
  2014-01-24 15:50                   ` Jeff Layton
  2014-01-24 15:52                   ` Jeff Layton
  0 siblings, 2 replies; 37+ messages in thread
From: Jeff Layton @ 2014-01-22 12:04 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Trond.Myklebust, linux-nfs

On Wed, 22 Jan 2014 00:24:14 -0800
Christoph Hellwig <hch@infradead.org> wrote:

> On Tue, Jan 21, 2014 at 02:21:59PM -0500, Jeff Layton wrote:
> > In any case, this helps but it's a little odd. With this patch, you add
> > an invalidate_inode_pages2 call prior to doing the DIO. But, you've
> > also left in the call to nfs_zap_mapping in the completion codepath.
> > 
> > So now, we shoot down the mapping prior to doing a DIO write, and then
> > mark the mapping for invalidation again when the write completes. Was
> > that intentional?
> > 
> > It seems a little excessive and might hurt performance in some cases.
> > OTOH, if you mix buffered and DIO you're asking for trouble anyway and
> > this approach seems to give better cache coherency.
> 
> This follows the model implemented and documented in
> generic_file_direct_write().
> 

Ok, thanks. That makes sense, and the problem described in those
comments is almost exactly the one I've seen in practice.

I'm still not 100% thrilled with the way that the NFS_INO_INVALID_DATA
flag is handled, but that really has nothing to do with this patchset.

You can add my Tested-by to the set if you like...

Cheers,
-- 
Jeff Layton <jlayton@redhat.com>

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 7/7] nfs: page cache invalidation for dio
  2014-01-22 12:04                 ` Jeff Layton
@ 2014-01-24 15:50                   ` Jeff Layton
  2014-01-24 15:52                   ` Jeff Layton
  1 sibling, 0 replies; 37+ messages in thread
From: Jeff Layton @ 2014-01-24 15:50 UTC (permalink / raw)
  To: Trond.Myklebust; +Cc: Christoph Hellwig, linux-nfs

On Wed, 22 Jan 2014 07:04:09 -0500
Jeff Layton <jlayton@redhat.com> wrote:

> On Wed, 22 Jan 2014 00:24:14 -0800
> Christoph Hellwig <hch@infradead.org> wrote:
> 
> > On Tue, Jan 21, 2014 at 02:21:59PM -0500, Jeff Layton wrote:
> > > In any case, this helps but it's a little odd. With this patch, you add
> > > an invalidate_inode_pages2 call prior to doing the DIO. But, you've
> > > also left in the call to nfs_zap_mapping in the completion codepath.
> > > 
> > > So now, we shoot down the mapping prior to doing a DIO write, and then
> > > mark the mapping for invalidation again when the write completes. Was
> > > that intentional?
> > > 
> > > It seems a little excessive and might hurt performance in some cases.
> > > OTOH, if you mix buffered and DIO you're asking for trouble anyway and
> > > this approach seems to give better cache coherency.
> > 
> > This follows the model implemented and documented in
> > generic_file_direct_write().
> > 
> 
> Ok, thanks. That makes sense, and the problem described in those
> comments is almost exactly the one I've seen in practice.
> 
> I'm still not 100% thrilled with the way that the NFS_INO_INVALID_DATA
> flag is handled, but that really has nothing to do with this patchset.
> 
> You can add my Tested-by to the set if you like...
> 

I may have spoken too soon...

This patchset didn't fix the problem once I cranked up the concurrency
from 100 child tasks to 1000. I think that HCH's patchset makes sense
and helps narrow the race window some, but the way that
nfs_revalidate_mapping/nfs_invalidate_mapping work is just racy.

The following patch does seem to fix it however. It's a combination of
a test patch that Trond gave me a while back and another change to
serialize the nfs_invalidate_mapping ops.

I think it's a reasonable approach to deal with the problem, but we
likely have some other areas that will need similar treatment since
they also check NFS_INO_INVALID_DATA: 

    nfs_write_pageuptodate
    nfs_readdir_search_for_cookie
    nfs_update_inode

Trond, thoughts? It's not quite ready for merge, but I'd like to get an
opinion on the basic approach, or whether you have an idea of how
better handle the races here:

------------------8<--------------------

NFS: fix the handling of NFS_INO_INVALID_DATA flag in nfs_revalidate_mapping

There is a possible race in how the nfs_invalidate_mapping is handled.
Currently, we go and invalidate the pages in the file and then clear
NFS_INO_INVALID_DATA.

The problem is that it's possible for a stale page to creep into the
mapping after the page was invalidated (i.e., via readahead). If another
writer comes along and sets the flag after that happens but before
invalidate_inode_pages2 returns then we could clear the flag
without the cache having been properly invalidated.

So, we must clear the flag first and then invalidate the pages.  This
however, opens another race:

It's possible to have two concurrent read() calls that end up in
nfs_revalidate_mapping at the same time. The first one clears the
NFS_INO_INVALID_DATA flag and then goes to call nfs_invalidate_mapping.

Just before calling that though, the other task races in, checks the
flag and finds it cleared. At that point, it sees that the mapping is
good and gets the lock on the page, allowing the read() to be satisfied
from the cache even though the data is no longer valid.

This effect is easily manifested by running diotest3 from the LTP test
suite on NFS. That program does a series of DIO writes and buffered
reads. The operations are serialized and page-aligned but the existing
code fails the test since it occasionally allows a read to come out of
the cache instead of being done on the wire when it should. While mixing
direct and buffered I/O isn't recommended, I believe it's possible to
hit this in other ways that just use buffered I/O, even though that
makes it harder to reproduce.

The problem is that the checking/clearing of that flag and the
invalidation of the mapping need to be as a unit. Fix this by
serializing concurrent invalidations with a bitlock.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
---
 fs/nfs/inode.c         | 32 +++++++++++++++++++++++++++-----
 include/linux/nfs_fs.h |  1 +
 2 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 00ad1c2..6fa07e1 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -977,11 +977,11 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map
 		if (ret < 0)
 			return ret;
 	}
-	spin_lock(&inode->i_lock);
-	nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
-	if (S_ISDIR(inode->i_mode))
+	if (S_ISDIR(inode->i_mode)) {
+		spin_lock(&inode->i_lock);
 		memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
-	spin_unlock(&inode->i_lock);
+		spin_unlock(&inode->i_lock);
+	}
 	nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE);
 	nfs_fscache_wait_on_invalidate(inode);
 
@@ -1007,6 +1007,7 @@ static bool nfs_mapping_need_revalidate_inode(struct inode *inode)
 int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
+	unsigned long *bitlock = &NFS_I(inode)->flags;
 	int ret = 0;
 
 	/* swapfiles are not supposed to be shared. */
@@ -1018,12 +1019,33 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
 		if (ret < 0)
 			goto out;
 	}
+
+	/*
+	 * We must clear NFS_INO_INVALID_DATA first to ensure that
+	 * invalidations that come in while we're shooting down the mappings
+	 * are respected. But, that leaves a race window where one revalidator
+	 * can clear the flag, and then another checks it before the mapping
+	 * gets invalidated. Fix that by serializing access to this part of
+	 * the function.
+	 */
+	ret = wait_on_bit_lock(bitlock, NFS_INO_INVALIDATING,
+				nfs_wait_bit_killable, TASK_KILLABLE);
+	if (ret)
+		goto out;
+
+	spin_lock(&inode->i_lock);
 	if (nfsi->cache_validity & NFS_INO_INVALID_DATA) {
+		nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
+		spin_unlock(&inode->i_lock);
 		trace_nfs_invalidate_mapping_enter(inode);
 		ret = nfs_invalidate_mapping(inode, mapping);
 		trace_nfs_invalidate_mapping_exit(inode, ret);
-	}
+	} else
+		spin_unlock(&inode->i_lock);
 
+	clear_bit_unlock(NFS_INO_INVALIDATING, bitlock);
+	smp_mb__after_clear_bit();
+	wake_up_bit(bitlock, NFS_INO_INVALIDATING);
 out:
 	return ret;
 }
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 4899737..18fb16f 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -215,6 +215,7 @@ struct nfs_inode {
 #define NFS_INO_ADVISE_RDPLUS	(0)		/* advise readdirplus */
 #define NFS_INO_STALE		(1)		/* possible stale inode */
 #define NFS_INO_ACL_LRU_SET	(2)		/* Inode is on the LRU list */
+#define NFS_INO_INVALIDATING	(3)		/* inode is being invalidated */
 #define NFS_INO_FLUSHING	(4)		/* inode is flushing out data */
 #define NFS_INO_FSCACHE		(5)		/* inode can be cached by FS-Cache */
 #define NFS_INO_FSCACHE_LOCK	(6)		/* FS-Cache cookie management lock */
-- 
1.8.5.3


^ permalink raw reply related	[flat|nested] 37+ messages in thread

* Re: [PATCH 7/7] nfs: page cache invalidation for dio
  2014-01-22 12:04                 ` Jeff Layton
  2014-01-24 15:50                   ` Jeff Layton
@ 2014-01-24 15:52                   ` Jeff Layton
  2014-01-24 17:11                     ` Trond Myklebust
  1 sibling, 1 reply; 37+ messages in thread
From: Jeff Layton @ 2014-01-24 15:52 UTC (permalink / raw)
  To: Trond.Myklebust; +Cc: Christoph Hellwig, linux-nfs

On Wed, 22 Jan 2014 07:04:09 -0500
Jeff Layton <jlayton@redhat.com> wrote:

> On Wed, 22 Jan 2014 00:24:14 -0800
> Christoph Hellwig <hch@infradead.org> wrote:
> 
> > On Tue, Jan 21, 2014 at 02:21:59PM -0500, Jeff Layton wrote:
> > > In any case, this helps but it's a little odd. With this patch, you add
> > > an invalidate_inode_pages2 call prior to doing the DIO. But, you've
> > > also left in the call to nfs_zap_mapping in the completion codepath.
> > > 
> > > So now, we shoot down the mapping prior to doing a DIO write, and then
> > > mark the mapping for invalidation again when the write completes. Was
> > > that intentional?
> > > 
> > > It seems a little excessive and might hurt performance in some cases.
> > > OTOH, if you mix buffered and DIO you're asking for trouble anyway and
> > > this approach seems to give better cache coherency.
> > 
> > This follows the model implemented and documented in
> > generic_file_direct_write().
> > 
> 
> Ok, thanks. That makes sense, and the problem described in those
> comments is almost exactly the one I've seen in practice.
> 
> I'm still not 100% thrilled with the way that the NFS_INO_INVALID_DATA
> flag is handled, but that really has nothing to do with this patchset.
> 
> You can add my Tested-by to the set if you like...
> 

(re-sending with Trond's address fixed)

I may have spoken too soon...

This patchset didn't fix the problem once I cranked up the concurrency
from 100 child tasks to 1000. I think that HCH's patchset makes sense
and helps narrow the race window some, but the way that
nfs_revalidate_mapping/nfs_invalidate_mapping work is just racy.

The following patch does seem to fix it however. It's a combination of
a test patch that Trond gave me a while back and another change to
serialize the nfs_invalidate_mapping ops.

I think it's a reasonable approach to deal with the problem, but we
likely have some other areas that will need similar treatment since
they also check NFS_INO_INVALID_DATA: 

    nfs_write_pageuptodate
    nfs_readdir_search_for_cookie
    nfs_update_inode

Trond, thoughts? It's not quite ready for merge, but I'd like to get an
opinion on the basic approach, or whether you have an idea of how
better handle the races here:

------------------8<--------------------

NFS: fix the handling of NFS_INO_INVALID_DATA flag in nfs_revalidate_mapping

There is a possible race in how the nfs_invalidate_mapping is handled.
Currently, we go and invalidate the pages in the file and then clear
NFS_INO_INVALID_DATA.

The problem is that it's possible for a stale page to creep into the
mapping after the page was invalidated (i.e., via readahead). If another
writer comes along and sets the flag after that happens but before
invalidate_inode_pages2 returns then we could clear the flag
without the cache having been properly invalidated.

So, we must clear the flag first and then invalidate the pages.  This
however, opens another race:

It's possible to have two concurrent read() calls that end up in
nfs_revalidate_mapping at the same time. The first one clears the
NFS_INO_INVALID_DATA flag and then goes to call nfs_invalidate_mapping.

Just before calling that though, the other task races in, checks the
flag and finds it cleared. At that point, it sees that the mapping is
good and gets the lock on the page, allowing the read() to be satisfied
from the cache even though the data is no longer valid.

This effect is easily manifested by running diotest3 from the LTP test
suite on NFS. That program does a series of DIO writes and buffered
reads. The operations are serialized and page-aligned but the existing
code fails the test since it occasionally allows a read to come out of
the cache instead of being done on the wire when it should. While mixing
direct and buffered I/O isn't recommended, I believe it's possible to
hit this in other ways that just use buffered I/O, even though that
makes it harder to reproduce.

The problem is that the checking/clearing of that flag and the
invalidation of the mapping need to be as a unit. Fix this by
serializing concurrent invalidations with a bitlock.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
---
 fs/nfs/inode.c         | 32 +++++++++++++++++++++++++++-----
 include/linux/nfs_fs.h |  1 +
 2 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 00ad1c2..6fa07e1 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -977,11 +977,11 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map
 		if (ret < 0)
 			return ret;
 	}
-	spin_lock(&inode->i_lock);
-	nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
-	if (S_ISDIR(inode->i_mode))
+	if (S_ISDIR(inode->i_mode)) {
+		spin_lock(&inode->i_lock);
 		memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
-	spin_unlock(&inode->i_lock);
+		spin_unlock(&inode->i_lock);
+	}
 	nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE);
 	nfs_fscache_wait_on_invalidate(inode);
 
@@ -1007,6 +1007,7 @@ static bool nfs_mapping_need_revalidate_inode(struct inode *inode)
 int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
+	unsigned long *bitlock = &NFS_I(inode)->flags;
 	int ret = 0;
 
 	/* swapfiles are not supposed to be shared. */
@@ -1018,12 +1019,33 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
 		if (ret < 0)
 			goto out;
 	}
+
+	/*
+	 * We must clear NFS_INO_INVALID_DATA first to ensure that
+	 * invalidations that come in while we're shooting down the mappings
+	 * are respected. But, that leaves a race window where one revalidator
+	 * can clear the flag, and then another checks it before the mapping
+	 * gets invalidated. Fix that by serializing access to this part of
+	 * the function.
+	 */
+	ret = wait_on_bit_lock(bitlock, NFS_INO_INVALIDATING,
+				nfs_wait_bit_killable, TASK_KILLABLE);
+	if (ret)
+		goto out;
+
+	spin_lock(&inode->i_lock);
 	if (nfsi->cache_validity & NFS_INO_INVALID_DATA) {
+		nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
+		spin_unlock(&inode->i_lock);
 		trace_nfs_invalidate_mapping_enter(inode);
 		ret = nfs_invalidate_mapping(inode, mapping);
 		trace_nfs_invalidate_mapping_exit(inode, ret);
-	}
+	} else
+		spin_unlock(&inode->i_lock);
 
+	clear_bit_unlock(NFS_INO_INVALIDATING, bitlock);
+	smp_mb__after_clear_bit();
+	wake_up_bit(bitlock, NFS_INO_INVALIDATING);
 out:
 	return ret;
 }
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 4899737..18fb16f 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -215,6 +215,7 @@ struct nfs_inode {
 #define NFS_INO_ADVISE_RDPLUS	(0)		/* advise readdirplus */
 #define NFS_INO_STALE		(1)		/* possible stale inode */
 #define NFS_INO_ACL_LRU_SET	(2)		/* Inode is on the LRU list */
+#define NFS_INO_INVALIDATING	(3)		/* inode is being invalidated */
 #define NFS_INO_FLUSHING	(4)		/* inode is flushing out data */
 #define NFS_INO_FSCACHE		(5)		/* inode can be cached by FS-Cache */
 #define NFS_INO_FSCACHE_LOCK	(6)		/* FS-Cache cookie management lock */
-- 
1.8.5.3


^ permalink raw reply related	[flat|nested] 37+ messages in thread

* Re: [PATCH 7/7] nfs: page cache invalidation for dio
  2014-01-24 15:52                   ` Jeff Layton
@ 2014-01-24 17:11                     ` Trond Myklebust
  2014-01-24 17:29                       ` Jeff Layton
  0 siblings, 1 reply; 37+ messages in thread
From: Trond Myklebust @ 2014-01-24 17:11 UTC (permalink / raw)
  To: Layton Jeff; +Cc: Christoph Hellwig, linuxnfs


On Jan 24, 2014, at 8:52, Jeff Layton <jlayton@redhat.com> wrote:

> On Wed, 22 Jan 2014 07:04:09 -0500
> Jeff Layton <jlayton@redhat.com> wrote:
> 
>> On Wed, 22 Jan 2014 00:24:14 -0800
>> Christoph Hellwig <hch@infradead.org> wrote:
>> 
>>> On Tue, Jan 21, 2014 at 02:21:59PM -0500, Jeff Layton wrote:
>>>> In any case, this helps but it's a little odd. With this patch, you add
>>>> an invalidate_inode_pages2 call prior to doing the DIO. But, you've
>>>> also left in the call to nfs_zap_mapping in the completion codepath.
>>>> 
>>>> So now, we shoot down the mapping prior to doing a DIO write, and then
>>>> mark the mapping for invalidation again when the write completes. Was
>>>> that intentional?
>>>> 
>>>> It seems a little excessive and might hurt performance in some cases.
>>>> OTOH, if you mix buffered and DIO you're asking for trouble anyway and
>>>> this approach seems to give better cache coherency.
>>> 
>>> This follows the model implemented and documented in
>>> generic_file_direct_write().
>>> 
>> 
>> Ok, thanks. That makes sense, and the problem described in those
>> comments is almost exactly the one I've seen in practice.
>> 
>> I'm still not 100% thrilled with the way that the NFS_INO_INVALID_DATA
>> flag is handled, but that really has nothing to do with this patchset.
>> 
>> You can add my Tested-by to the set if you like...
>> 
> 
> (re-sending with Trond's address fixed)
> 
> I may have spoken too soon...
> 
> This patchset didn't fix the problem once I cranked up the concurrency
> from 100 child tasks to 1000. I think that HCH's patchset makes sense
> and helps narrow the race window some, but the way that
> nfs_revalidate_mapping/nfs_invalidate_mapping work is just racy.
> 
> The following patch does seem to fix it however. It's a combination of
> a test patch that Trond gave me a while back and another change to
> serialize the nfs_invalidate_mapping ops.
> 
> I think it's a reasonable approach to deal with the problem, but we
> likely have some other areas that will need similar treatment since
> they also check NFS_INO_INVALID_DATA: 
> 
>    nfs_write_pageuptodate
>    nfs_readdir_search_for_cookie
>    nfs_update_inode
> 
> Trond, thoughts? It's not quite ready for merge, but I'd like to get an
> opinion on the basic approach, or whether you have an idea of how
> better handle the races here:

I think that it is reasonable for nfs_revalidate_mapping, but I don’t see how it is relevant to nfs_update_inode or nfs_write_pageuptodate.
Readdir already has its own locking at the VFS level, so we shouldn’t need to care there.

Cheers
  Trond
--
Trond Myklebust
Linux NFS client maintainer


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 7/7] nfs: page cache invalidation for dio
  2014-01-24 17:11                     ` Trond Myklebust
@ 2014-01-24 17:29                       ` Jeff Layton
  2014-01-24 17:40                         ` Trond Myklebust
  0 siblings, 1 reply; 37+ messages in thread
From: Jeff Layton @ 2014-01-24 17:29 UTC (permalink / raw)
  To: Trond Myklebust; +Cc: Christoph Hellwig, linuxnfs

On Fri, 24 Jan 2014 10:11:11 -0700
Trond Myklebust <trond.myklebust@primarydata.com> wrote:

> 
> On Jan 24, 2014, at 8:52, Jeff Layton <jlayton@redhat.com> wrote:
> 
> > On Wed, 22 Jan 2014 07:04:09 -0500
> > Jeff Layton <jlayton@redhat.com> wrote:
> > 
> >> On Wed, 22 Jan 2014 00:24:14 -0800
> >> Christoph Hellwig <hch@infradead.org> wrote:
> >> 
> >>> On Tue, Jan 21, 2014 at 02:21:59PM -0500, Jeff Layton wrote:
> >>>> In any case, this helps but it's a little odd. With this patch, you add
> >>>> an invalidate_inode_pages2 call prior to doing the DIO. But, you've
> >>>> also left in the call to nfs_zap_mapping in the completion codepath.
> >>>> 
> >>>> So now, we shoot down the mapping prior to doing a DIO write, and then
> >>>> mark the mapping for invalidation again when the write completes. Was
> >>>> that intentional?
> >>>> 
> >>>> It seems a little excessive and might hurt performance in some cases.
> >>>> OTOH, if you mix buffered and DIO you're asking for trouble anyway and
> >>>> this approach seems to give better cache coherency.
> >>> 
> >>> This follows the model implemented and documented in
> >>> generic_file_direct_write().
> >>> 
> >> 
> >> Ok, thanks. That makes sense, and the problem described in those
> >> comments is almost exactly the one I've seen in practice.
> >> 
> >> I'm still not 100% thrilled with the way that the NFS_INO_INVALID_DATA
> >> flag is handled, but that really has nothing to do with this patchset.
> >> 
> >> You can add my Tested-by to the set if you like...
> >> 
> > 
> > (re-sending with Trond's address fixed)
> > 
> > I may have spoken too soon...
> > 
> > This patchset didn't fix the problem once I cranked up the concurrency
> > from 100 child tasks to 1000. I think that HCH's patchset makes sense
> > and helps narrow the race window some, but the way that
> > nfs_revalidate_mapping/nfs_invalidate_mapping work is just racy.
> > 
> > The following patch does seem to fix it however. It's a combination of
> > a test patch that Trond gave me a while back and another change to
> > serialize the nfs_invalidate_mapping ops.
> > 
> > I think it's a reasonable approach to deal with the problem, but we
> > likely have some other areas that will need similar treatment since
> > they also check NFS_INO_INVALID_DATA: 
> > 
> >    nfs_write_pageuptodate
> >    nfs_readdir_search_for_cookie
> >    nfs_update_inode
> > 
> > Trond, thoughts? It's not quite ready for merge, but I'd like to get an
> > opinion on the basic approach, or whether you have an idea of how
> > better handle the races here:
> 
> I think that it is reasonable for nfs_revalidate_mapping, but I don’t see how it is relevant to nfs_update_inode or nfs_write_pageuptodate.
> Readdir already has its own locking at the VFS level, so we shouldn’t need to care there.
> 


nfs_write_pageuptodate does this:

---------------8<-----------------
        if (NFS_I(inode)->cache_validity & (NFS_INO_INVALID_DATA|NFS_INO_REVAL_PAGECACHE))
                return false;
out:
        return PageUptodate(page) != 0;
---------------8<-----------------

With the proposed patch, NFS_INO_INVALID_DATA would be cleared first and
only later would the page be invalidated. So, there's a race window in
there where the bit could be cleared but the page flag is still set,
even though it's on its way out the cache. So, I think we'd need to do
some similar sort of locking in there to make sure that doesn't happen.

nfs_update_inode just does this:

        if (invalid & NFS_INO_INVALID_DATA)
                nfs_fscache_invalidate(inode);

...again, since we clear the bit first with this patch, I think we have
a potential race window there too. We might not see it set in a
situation where we would have before. That case is a bit more
problematic since we can't sleep to wait on the bitlock there.

It might be best to just get rid of that call altogether and move it
into nfs_invalidate_mapping. It seems to me that we ought to just
handle fscache the same way we do the pagecache when it comes to
invalidation.

As far as the readdir code goes, I haven't looked as closely at that
yet. I just noticed that it checked for NFS_INO_INVALID_DATA. Once we
settle the other two cases, I'll give that closer scrutiny.

Thanks,
-- 
Jeff Layton <jlayton@redhat.com>

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 7/7] nfs: page cache invalidation for dio
  2014-01-24 17:29                       ` Jeff Layton
@ 2014-01-24 17:40                         ` Trond Myklebust
  2014-01-24 18:00                           ` Jeff Layton
  0 siblings, 1 reply; 37+ messages in thread
From: Trond Myklebust @ 2014-01-24 17:40 UTC (permalink / raw)
  To: Jeff Layton; +Cc: Christoph Hellwig, linuxnfs

On Fri, 2014-01-24 at 12:29 -0500, Jeff Layton wrote:
> On Fri, 24 Jan 2014 10:11:11 -0700
> Trond Myklebust <trond.myklebust@primarydata.com> wrote:
> 
> > 
> > On Jan 24, 2014, at 8:52, Jeff Layton <jlayton@redhat.com> wrote:
> > 
> > > On Wed, 22 Jan 2014 07:04:09 -0500
> > > Jeff Layton <jlayton@redhat.com> wrote:
> > > 
> > >> On Wed, 22 Jan 2014 00:24:14 -0800
> > >> Christoph Hellwig <hch@infradead.org> wrote:
> > >> 
> > >>> On Tue, Jan 21, 2014 at 02:21:59PM -0500, Jeff Layton wrote:
> > >>>> In any case, this helps but it's a little odd. With this patch, you add
> > >>>> an invalidate_inode_pages2 call prior to doing the DIO. But, you've
> > >>>> also left in the call to nfs_zap_mapping in the completion codepath.
> > >>>> 
> > >>>> So now, we shoot down the mapping prior to doing a DIO write, and then
> > >>>> mark the mapping for invalidation again when the write completes. Was
> > >>>> that intentional?
> > >>>> 
> > >>>> It seems a little excessive and might hurt performance in some cases.
> > >>>> OTOH, if you mix buffered and DIO you're asking for trouble anyway and
> > >>>> this approach seems to give better cache coherency.
> > >>> 
> > >>> This follows the model implemented and documented in
> > >>> generic_file_direct_write().
> > >>> 
> > >> 
> > >> Ok, thanks. That makes sense, and the problem described in those
> > >> comments is almost exactly the one I've seen in practice.
> > >> 
> > >> I'm still not 100% thrilled with the way that the NFS_INO_INVALID_DATA
> > >> flag is handled, but that really has nothing to do with this patchset.
> > >> 
> > >> You can add my Tested-by to the set if you like...
> > >> 
> > > 
> > > (re-sending with Trond's address fixed)
> > > 
> > > I may have spoken too soon...
> > > 
> > > This patchset didn't fix the problem once I cranked up the concurrency
> > > from 100 child tasks to 1000. I think that HCH's patchset makes sense
> > > and helps narrow the race window some, but the way that
> > > nfs_revalidate_mapping/nfs_invalidate_mapping work is just racy.
> > > 
> > > The following patch does seem to fix it however. It's a combination of
> > > a test patch that Trond gave me a while back and another change to
> > > serialize the nfs_invalidate_mapping ops.
> > > 
> > > I think it's a reasonable approach to deal with the problem, but we
> > > likely have some other areas that will need similar treatment since
> > > they also check NFS_INO_INVALID_DATA: 
> > > 
> > >    nfs_write_pageuptodate
> > >    nfs_readdir_search_for_cookie
> > >    nfs_update_inode
> > > 
> > > Trond, thoughts? It's not quite ready for merge, but I'd like to get an
> > > opinion on the basic approach, or whether you have an idea of how
> > > better handle the races here:
> > 
> > I think that it is reasonable for nfs_revalidate_mapping, but I don’t see how it is relevant to nfs_update_inode or nfs_write_pageuptodate.
> > Readdir already has its own locking at the VFS level, so we shouldn’t need to care there.
> > 
> 
> 
> nfs_write_pageuptodate does this:
> 
> ---------------8<-----------------
>         if (NFS_I(inode)->cache_validity & (NFS_INO_INVALID_DATA|NFS_INO_REVAL_PAGECACHE))
>                 return false;
> out:
>         return PageUptodate(page) != 0;
> ---------------8<-----------------
> 
> With the proposed patch, NFS_INO_INVALID_DATA would be cleared first and
> only later would the page be invalidated. So, there's a race window in
> there where the bit could be cleared but the page flag is still set,
> even though it's on its way out the cache. So, I think we'd need to do
> some similar sort of locking in there to make sure that doesn't happen.

We _cannot_ lock against nfs_revalidate_mapping() here, because we could
end up deadlocking with invalidate_inode_pages2().

If you like, we could add a test for NFS_INO_INVALIDATING, to turn off
the optimisation in that case, but I'd like to understand what the race
would be: don't forget that the page is marked as PageUptodate(), which
means that either invalidate_inode_pages2() has not yet reached this
page, or that a read of the page succeeded after the invalidation was
made.

> nfs_update_inode just does this:
> 
>         if (invalid & NFS_INO_INVALID_DATA)
>                 nfs_fscache_invalidate(inode);
> 
> ...again, since we clear the bit first with this patch, I think we have
> a potential race window there too. We might not see it set in a
> situation where we would have before. That case is a bit more
> problematic since we can't sleep to wait on the bitlock there.

Umm... That test in nfs_update_inode() is there because we might just
have _set_ the NFS_INO_INVALID_DATA bit.

> 
> It might be best to just get rid of that call altogether and move it
> into nfs_invalidate_mapping. It seems to me that we ought to just
> handle fscache the same way we do the pagecache when it comes to
> invalidation.
> 
> As far as the readdir code goes, I haven't looked as closely at that
> yet. I just noticed that it checked for NFS_INO_INVALID_DATA. Once we
> settle the other two cases, I'll give that closer scrutiny.
> 
> Thanks,


-- 
Trond Myklebust
Linux NFS client maintainer


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 7/7] nfs: page cache invalidation for dio
  2014-01-24 17:40                         ` Trond Myklebust
@ 2014-01-24 18:00                           ` Jeff Layton
  2014-01-24 18:46                             ` Trond Myklebust
  0 siblings, 1 reply; 37+ messages in thread
From: Jeff Layton @ 2014-01-24 18:00 UTC (permalink / raw)
  To: Trond Myklebust; +Cc: Christoph Hellwig, linuxnfs

On Fri, 24 Jan 2014 10:40:06 -0700
Trond Myklebust <trond.myklebust@primarydata.com> wrote:

> On Fri, 2014-01-24 at 12:29 -0500, Jeff Layton wrote:
> > On Fri, 24 Jan 2014 10:11:11 -0700
> > Trond Myklebust <trond.myklebust@primarydata.com> wrote:
> > 
> > > 
> > > On Jan 24, 2014, at 8:52, Jeff Layton <jlayton@redhat.com> wrote:
> > > 
> > > > On Wed, 22 Jan 2014 07:04:09 -0500
> > > > Jeff Layton <jlayton@redhat.com> wrote:
> > > > 
> > > >> On Wed, 22 Jan 2014 00:24:14 -0800
> > > >> Christoph Hellwig <hch@infradead.org> wrote:
> > > >> 
> > > >>> On Tue, Jan 21, 2014 at 02:21:59PM -0500, Jeff Layton wrote:
> > > >>>> In any case, this helps but it's a little odd. With this patch, you add
> > > >>>> an invalidate_inode_pages2 call prior to doing the DIO. But, you've
> > > >>>> also left in the call to nfs_zap_mapping in the completion codepath.
> > > >>>> 
> > > >>>> So now, we shoot down the mapping prior to doing a DIO write, and then
> > > >>>> mark the mapping for invalidation again when the write completes. Was
> > > >>>> that intentional?
> > > >>>> 
> > > >>>> It seems a little excessive and might hurt performance in some cases.
> > > >>>> OTOH, if you mix buffered and DIO you're asking for trouble anyway and
> > > >>>> this approach seems to give better cache coherency.
> > > >>> 
> > > >>> This follows the model implemented and documented in
> > > >>> generic_file_direct_write().
> > > >>> 
> > > >> 
> > > >> Ok, thanks. That makes sense, and the problem described in those
> > > >> comments is almost exactly the one I've seen in practice.
> > > >> 
> > > >> I'm still not 100% thrilled with the way that the NFS_INO_INVALID_DATA
> > > >> flag is handled, but that really has nothing to do with this patchset.
> > > >> 
> > > >> You can add my Tested-by to the set if you like...
> > > >> 
> > > > 
> > > > (re-sending with Trond's address fixed)
> > > > 
> > > > I may have spoken too soon...
> > > > 
> > > > This patchset didn't fix the problem once I cranked up the concurrency
> > > > from 100 child tasks to 1000. I think that HCH's patchset makes sense
> > > > and helps narrow the race window some, but the way that
> > > > nfs_revalidate_mapping/nfs_invalidate_mapping work is just racy.
> > > > 
> > > > The following patch does seem to fix it however. It's a combination of
> > > > a test patch that Trond gave me a while back and another change to
> > > > serialize the nfs_invalidate_mapping ops.
> > > > 
> > > > I think it's a reasonable approach to deal with the problem, but we
> > > > likely have some other areas that will need similar treatment since
> > > > they also check NFS_INO_INVALID_DATA: 
> > > > 
> > > >    nfs_write_pageuptodate
> > > >    nfs_readdir_search_for_cookie
> > > >    nfs_update_inode
> > > > 
> > > > Trond, thoughts? It's not quite ready for merge, but I'd like to get an
> > > > opinion on the basic approach, or whether you have an idea of how
> > > > better handle the races here:
> > > 
> > > I think that it is reasonable for nfs_revalidate_mapping, but I don’t see how it is relevant to nfs_update_inode or nfs_write_pageuptodate.
> > > Readdir already has its own locking at the VFS level, so we shouldn’t need to care there.
> > > 
> > 
> > 
> > nfs_write_pageuptodate does this:
> > 
> > ---------------8<-----------------
> >         if (NFS_I(inode)->cache_validity & (NFS_INO_INVALID_DATA|NFS_INO_REVAL_PAGECACHE))
> >                 return false;
> > out:
> >         return PageUptodate(page) != 0;
> > ---------------8<-----------------
> > 
> > With the proposed patch, NFS_INO_INVALID_DATA would be cleared first and
> > only later would the page be invalidated. So, there's a race window in
> > there where the bit could be cleared but the page flag is still set,
> > even though it's on its way out the cache. So, I think we'd need to do
> > some similar sort of locking in there to make sure that doesn't happen.
> 
> We _cannot_ lock against nfs_revalidate_mapping() here, because we could
> end up deadlocking with invalidate_inode_pages2().
> 
> If you like, we could add a test for NFS_INO_INVALIDATING, to turn off
> the optimisation in that case, but I'd like to understand what the race
> would be: don't forget that the page is marked as PageUptodate(), which
> means that either invalidate_inode_pages2() has not yet reached this
> page, or that a read of the page succeeded after the invalidation was
> made.
> 

Right. The first situation seems wrong to me. We've marked the file as
INVALID and then cleared the bit to start the process of invalidating
the actual pages. It seems like nfs_write_pageuptodate ought not return
true even if PageUptodate() is still set at that point.

We could check NFS_INO_INVALIDATING, but we might miss that
optimization in a lot of cases just because something happens to be
in nfs_revalidate_mapping. Maybe that means that this bitlock isn't
sufficient and we need some other mechanism. I'm not sure what that
should be though.

> > nfs_update_inode just does this:
> > 
> >         if (invalid & NFS_INO_INVALID_DATA)
> >                 nfs_fscache_invalidate(inode);
> > 
> > ...again, since we clear the bit first with this patch, I think we have
> > a potential race window there too. We might not see it set in a
> > situation where we would have before. That case is a bit more
> > problematic since we can't sleep to wait on the bitlock there.
> 
> Umm... That test in nfs_update_inode() is there because we might just
> have _set_ the NFS_INO_INVALID_DATA bit.
> 

Correct. But do we need to force a fscache invalidation at that point,
or can it wait until we're going to invalidate the mapping too?

> > 
> > It might be best to just get rid of that call altogether and move it
> > into nfs_invalidate_mapping. It seems to me that we ought to just
> > handle fscache the same way we do the pagecache when it comes to
> > invalidation.
> > 
> > As far as the readdir code goes, I haven't looked as closely at that
> > yet. I just noticed that it checked for NFS_INO_INVALID_DATA. Once we
> > settle the other two cases, I'll give that closer scrutiny.
> > 

-- 
Jeff Layton <jlayton@redhat.com>

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 7/7] nfs: page cache invalidation for dio
  2014-01-24 18:00                           ` Jeff Layton
@ 2014-01-24 18:46                             ` Trond Myklebust
  2014-01-24 21:21                               ` Jeff Layton
  0 siblings, 1 reply; 37+ messages in thread
From: Trond Myklebust @ 2014-01-24 18:46 UTC (permalink / raw)
  To: Jeff Layton; +Cc: Christoph Hellwig, linuxnfs

On Fri, 2014-01-24 at 13:00 -0500, Jeff Layton wrote:
> On Fri, 24 Jan 2014 10:40:06 -0700
> Trond Myklebust <trond.myklebust@primarydata.com> wrote:
> 
> > On Fri, 2014-01-24 at 12:29 -0500, Jeff Layton wrote:
> > > On Fri, 24 Jan 2014 10:11:11 -0700
> > > Trond Myklebust <trond.myklebust@primarydata.com> wrote:
> > > 
> > > > 
> > > > On Jan 24, 2014, at 8:52, Jeff Layton <jlayton@redhat.com> wrote:
> > > > 
> > > > > On Wed, 22 Jan 2014 07:04:09 -0500
> > > > > Jeff Layton <jlayton@redhat.com> wrote:
> > > > > 
> > > > >> On Wed, 22 Jan 2014 00:24:14 -0800
> > > > >> Christoph Hellwig <hch@infradead.org> wrote:
> > > > >> 
> > > > >>> On Tue, Jan 21, 2014 at 02:21:59PM -0500, Jeff Layton wrote:
> > > > >>>> In any case, this helps but it's a little odd. With this patch, you add
> > > > >>>> an invalidate_inode_pages2 call prior to doing the DIO. But, you've
> > > > >>>> also left in the call to nfs_zap_mapping in the completion codepath.
> > > > >>>> 
> > > > >>>> So now, we shoot down the mapping prior to doing a DIO write, and then
> > > > >>>> mark the mapping for invalidation again when the write completes. Was
> > > > >>>> that intentional?
> > > > >>>> 
> > > > >>>> It seems a little excessive and might hurt performance in some cases.
> > > > >>>> OTOH, if you mix buffered and DIO you're asking for trouble anyway and
> > > > >>>> this approach seems to give better cache coherency.
> > > > >>> 
> > > > >>> This follows the model implemented and documented in
> > > > >>> generic_file_direct_write().
> > > > >>> 
> > > > >> 
> > > > >> Ok, thanks. That makes sense, and the problem described in those
> > > > >> comments is almost exactly the one I've seen in practice.
> > > > >> 
> > > > >> I'm still not 100% thrilled with the way that the NFS_INO_INVALID_DATA
> > > > >> flag is handled, but that really has nothing to do with this patchset.
> > > > >> 
> > > > >> You can add my Tested-by to the set if you like...
> > > > >> 
> > > > > 
> > > > > (re-sending with Trond's address fixed)
> > > > > 
> > > > > I may have spoken too soon...
> > > > > 
> > > > > This patchset didn't fix the problem once I cranked up the concurrency
> > > > > from 100 child tasks to 1000. I think that HCH's patchset makes sense
> > > > > and helps narrow the race window some, but the way that
> > > > > nfs_revalidate_mapping/nfs_invalidate_mapping work is just racy.
> > > > > 
> > > > > The following patch does seem to fix it however. It's a combination of
> > > > > a test patch that Trond gave me a while back and another change to
> > > > > serialize the nfs_invalidate_mapping ops.
> > > > > 
> > > > > I think it's a reasonable approach to deal with the problem, but we
> > > > > likely have some other areas that will need similar treatment since
> > > > > they also check NFS_INO_INVALID_DATA: 
> > > > > 
> > > > >    nfs_write_pageuptodate
> > > > >    nfs_readdir_search_for_cookie
> > > > >    nfs_update_inode
> > > > > 
> > > > > Trond, thoughts? It's not quite ready for merge, but I'd like to get an
> > > > > opinion on the basic approach, or whether you have an idea of how
> > > > > better handle the races here:
> > > > 
> > > > I think that it is reasonable for nfs_revalidate_mapping, but I don’t see how it is relevant to nfs_update_inode or nfs_write_pageuptodate.
> > > > Readdir already has its own locking at the VFS level, so we shouldn’t need to care there.
> > > > 
> > > 
> > > 
> > > nfs_write_pageuptodate does this:
> > > 
> > > ---------------8<-----------------
> > >         if (NFS_I(inode)->cache_validity & (NFS_INO_INVALID_DATA|NFS_INO_REVAL_PAGECACHE))
> > >                 return false;
> > > out:
> > >         return PageUptodate(page) != 0;
> > > ---------------8<-----------------
> > > 
> > > With the proposed patch, NFS_INO_INVALID_DATA would be cleared first and
> > > only later would the page be invalidated. So, there's a race window in
> > > there where the bit could be cleared but the page flag is still set,
> > > even though it's on its way out the cache. So, I think we'd need to do
> > > some similar sort of locking in there to make sure that doesn't happen.
> > 
> > We _cannot_ lock against nfs_revalidate_mapping() here, because we could
> > end up deadlocking with invalidate_inode_pages2().
> > 
> > If you like, we could add a test for NFS_INO_INVALIDATING, to turn off
> > the optimisation in that case, but I'd like to understand what the race
> > would be: don't forget that the page is marked as PageUptodate(), which
> > means that either invalidate_inode_pages2() has not yet reached this
> > page, or that a read of the page succeeded after the invalidation was
> > made.
> > 
> 
> Right. The first situation seems wrong to me. We've marked the file as
> INVALID and then cleared the bit to start the process of invalidating
> the actual pages. It seems like nfs_write_pageuptodate ought not return
> true even if PageUptodate() is still set at that point.
> 
> We could check NFS_INO_INVALIDATING, but we might miss that
> optimization in a lot of cases just because something happens to be
> in nfs_revalidate_mapping. Maybe that means that this bitlock isn't
> sufficient and we need some other mechanism. I'm not sure what that
> should be though.

Convert your patch to use wait_on_bit(), and then to call
wait_on_bit_lock() if and only if you see NFS_INO_INVALID_DATA is set.

> > > nfs_update_inode just does this:
> > > 
> > >         if (invalid & NFS_INO_INVALID_DATA)
> > >                 nfs_fscache_invalidate(inode);
> > > 
> > > ...again, since we clear the bit first with this patch, I think we have
> > > a potential race window there too. We might not see it set in a
> > > situation where we would have before. That case is a bit more
> > > problematic since we can't sleep to wait on the bitlock there.
> > 
> > Umm... That test in nfs_update_inode() is there because we might just
> > have _set_ the NFS_INO_INVALID_DATA bit.
> > 
> 
> Correct. But do we need to force a fscache invalidation at that point,
> or can it wait until we're going to invalidate the mapping too?

That's a question for David. My assumption is that since invalidation is
handled asynchronously by the fscache layer itself, that we need to let
it start that process as soon as possible, but perhaps these races are
an indication that we should actually do it at the time when we call
invalidate_inode_pages2() (or at the latest, when we're evicting the
inode from the icache)...


-- 
Trond Myklebust
Linux NFS client maintainer


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 7/7] nfs: page cache invalidation for dio
  2014-01-24 18:46                             ` Trond Myklebust
@ 2014-01-24 21:21                               ` Jeff Layton
  2014-01-25  0:39                                 ` Trond Myklebust
  0 siblings, 1 reply; 37+ messages in thread
From: Jeff Layton @ 2014-01-24 21:21 UTC (permalink / raw)
  To: Trond Myklebust; +Cc: Christoph Hellwig, linux-nfs

On Fri, 24 Jan 2014 11:46:41 -0700
Trond Myklebust <trond.myklebust@primarydata.com> wrote:

> On Fri, 2014-01-24 at 13:00 -0500, Jeff Layton wrote:
> > On Fri, 24 Jan 2014 10:40:06 -0700
> > Trond Myklebust <trond.myklebust@primarydata.com> wrote:
> > 
> > > On Fri, 2014-01-24 at 12:29 -0500, Jeff Layton wrote:
> > > > On Fri, 24 Jan 2014 10:11:11 -0700
> > > > Trond Myklebust <trond.myklebust@primarydata.com> wrote:
> > > > 
> > > > > 
> > > > > On Jan 24, 2014, at 8:52, Jeff Layton <jlayton@redhat.com> wrote:
> > > > > 
> > > > > > On Wed, 22 Jan 2014 07:04:09 -0500
> > > > > > Jeff Layton <jlayton@redhat.com> wrote:
> > > > > > 
> > > > > >> On Wed, 22 Jan 2014 00:24:14 -0800
> > > > > >> Christoph Hellwig <hch@infradead.org> wrote:
> > > > > >> 
> > > > > >>> On Tue, Jan 21, 2014 at 02:21:59PM -0500, Jeff Layton wrote:
> > > > > >>>> In any case, this helps but it's a little odd. With this patch, you add
> > > > > >>>> an invalidate_inode_pages2 call prior to doing the DIO. But, you've
> > > > > >>>> also left in the call to nfs_zap_mapping in the completion codepath.
> > > > > >>>> 
> > > > > >>>> So now, we shoot down the mapping prior to doing a DIO write, and then
> > > > > >>>> mark the mapping for invalidation again when the write completes. Was
> > > > > >>>> that intentional?
> > > > > >>>> 
> > > > > >>>> It seems a little excessive and might hurt performance in some cases.
> > > > > >>>> OTOH, if you mix buffered and DIO you're asking for trouble anyway and
> > > > > >>>> this approach seems to give better cache coherency.
> > > > > >>> 
> > > > > >>> This follows the model implemented and documented in
> > > > > >>> generic_file_direct_write().
> > > > > >>> 
> > > > > >> 
> > > > > >> Ok, thanks. That makes sense, and the problem described in those
> > > > > >> comments is almost exactly the one I've seen in practice.
> > > > > >> 
> > > > > >> I'm still not 100% thrilled with the way that the NFS_INO_INVALID_DATA
> > > > > >> flag is handled, but that really has nothing to do with this patchset.
> > > > > >> 
> > > > > >> You can add my Tested-by to the set if you like...
> > > > > >> 
> > > > > > 
> > > > > > (re-sending with Trond's address fixed)
> > > > > > 
> > > > > > I may have spoken too soon...
> > > > > > 
> > > > > > This patchset didn't fix the problem once I cranked up the concurrency
> > > > > > from 100 child tasks to 1000. I think that HCH's patchset makes sense
> > > > > > and helps narrow the race window some, but the way that
> > > > > > nfs_revalidate_mapping/nfs_invalidate_mapping work is just racy.
> > > > > > 
> > > > > > The following patch does seem to fix it however. It's a combination of
> > > > > > a test patch that Trond gave me a while back and another change to
> > > > > > serialize the nfs_invalidate_mapping ops.
> > > > > > 
> > > > > > I think it's a reasonable approach to deal with the problem, but we
> > > > > > likely have some other areas that will need similar treatment since
> > > > > > they also check NFS_INO_INVALID_DATA: 
> > > > > > 
> > > > > >    nfs_write_pageuptodate
> > > > > >    nfs_readdir_search_for_cookie
> > > > > >    nfs_update_inode
> > > > > > 
> > > > > > Trond, thoughts? It's not quite ready for merge, but I'd like to get an
> > > > > > opinion on the basic approach, or whether you have an idea of how
> > > > > > better handle the races here:
> > > > > 
> > > > > I think that it is reasonable for nfs_revalidate_mapping, but I don’t see how it is relevant to nfs_update_inode or nfs_write_pageuptodate.
> > > > > Readdir already has its own locking at the VFS level, so we shouldn’t need to care there.
> > > > > 
> > > > 
> > > > 
> > > > nfs_write_pageuptodate does this:
> > > > 
> > > > ---------------8<-----------------
> > > >         if (NFS_I(inode)->cache_validity & (NFS_INO_INVALID_DATA|NFS_INO_REVAL_PAGECACHE))
> > > >                 return false;
> > > > out:
> > > >         return PageUptodate(page) != 0;
> > > > ---------------8<-----------------
> > > > 
> > > > With the proposed patch, NFS_INO_INVALID_DATA would be cleared first and
> > > > only later would the page be invalidated. So, there's a race window in
> > > > there where the bit could be cleared but the page flag is still set,
> > > > even though it's on its way out the cache. So, I think we'd need to do
> > > > some similar sort of locking in there to make sure that doesn't happen.
> > > 
> > > We _cannot_ lock against nfs_revalidate_mapping() here, because we could
> > > end up deadlocking with invalidate_inode_pages2().
> > > 
> > > If you like, we could add a test for NFS_INO_INVALIDATING, to turn off
> > > the optimisation in that case, but I'd like to understand what the race
> > > would be: don't forget that the page is marked as PageUptodate(), which
> > > means that either invalidate_inode_pages2() has not yet reached this
> > > page, or that a read of the page succeeded after the invalidation was
> > > made.
> > > 
> > 
> > Right. The first situation seems wrong to me. We've marked the file as
> > INVALID and then cleared the bit to start the process of invalidating
> > the actual pages. It seems like nfs_write_pageuptodate ought not return
> > true even if PageUptodate() is still set at that point.
> > 
> > We could check NFS_INO_INVALIDATING, but we might miss that
> > optimization in a lot of cases just because something happens to be
> > in nfs_revalidate_mapping. Maybe that means that this bitlock isn't
> > sufficient and we need some other mechanism. I'm not sure what that
> > should be though.
> 
> Convert your patch to use wait_on_bit(), and then to call
> wait_on_bit_lock() if and only if you see NFS_INO_INVALID_DATA is set.
> 

I think that too would be racy...

We have to clear NFS_INO_INVALID_DATA while holding the i_lock, but we
can't wait_on_bit_lock() under that. So (pseudocode):

wait_on_bit
take i_lock
check and clear NFS_INO_INVALID_DATA
drop i_lock
wait_on_bit_lock

...so between dropping the i_lock and wait_on_bit_lock, we have a place
where another task could check the flag and find it clear.

I think the upshot here is that a bit_lock may not be the appropriate
thing to use to handle this. I'll have to ponder what might be better...

> > > > nfs_update_inode just does this:
> > > > 
> > > >         if (invalid & NFS_INO_INVALID_DATA)
> > > >                 nfs_fscache_invalidate(inode);
> > > > 
> > > > ...again, since we clear the bit first with this patch, I think we have
> > > > a potential race window there too. We might not see it set in a
> > > > situation where we would have before. That case is a bit more
> > > > problematic since we can't sleep to wait on the bitlock there.
> > > 
> > > Umm... That test in nfs_update_inode() is there because we might just
> > > have _set_ the NFS_INO_INVALID_DATA bit.
> > > 
> > 
> > Correct. But do we need to force a fscache invalidation at that point,
> > or can it wait until we're going to invalidate the mapping too?
> 
> That's a question for David. My assumption is that since invalidation is
> handled asynchronously by the fscache layer itself, that we need to let
> it start that process as soon as possible, but perhaps these races are
> an indication that we should actually do it at the time when we call
> invalidate_inode_pages2() (or at the latest, when we're evicting the
> inode from the icache)...
> 
> 

Ok, looks like it just sets a flag, so if we can handle this somehow
w/o sleeping then it may not matter. Again, I'll have to ponder what
may be better than a bit_lock.

Thanks,
-- 
Jeff Layton <jlayton@redhat.com>

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 7/7] nfs: page cache invalidation for dio
  2014-01-24 21:21                               ` Jeff Layton
@ 2014-01-25  0:39                                 ` Trond Myklebust
  2014-01-25  0:54                                   ` Jeff Layton
  0 siblings, 1 reply; 37+ messages in thread
From: Trond Myklebust @ 2014-01-25  0:39 UTC (permalink / raw)
  To: Layton Jeff; +Cc: Christoph Hellwig, linux-nfs


On Jan 24, 2014, at 14:21, Jeff Layton <jlayton@redhat.com> wrote:

> On Fri, 24 Jan 2014 11:46:41 -0700
> Trond Myklebust <trond.myklebust@primarydata.com> wrote:
>> 
>> Convert your patch to use wait_on_bit(), and then to call
>> wait_on_bit_lock() if and only if you see NFS_INO_INVALID_DATA is set.
>> 
> 
> I think that too would be racy...
> 
> We have to clear NFS_INO_INVALID_DATA while holding the i_lock, but we
> can't wait_on_bit_lock() under that. So (pseudocode):
> 
> wait_on_bit
> take i_lock
> check and clear NFS_INO_INVALID_DATA
> drop i_lock
> wait_on_bit_lock
> 
> ...so between dropping the i_lock and wait_on_bit_lock, we have a place
> where another task could check the flag and find it clear.


	for(;;) {
		wait_on_bit(NFS_INO_INVALIDATING)
		/* Optimisation: don’t lock NFS_INO_INVALIDATING
		 * if NFS_INO_INVALID_DATA was cleared while we waited.
		 */
		if (!test_bit(NFS_INO_INVALID_DATA))
			return;
		if (!test_and_set_bit_lock(NFS_INO_INVALIDATING))
			break;
	}
	spin_lock(inode->i_lock);
	if (!test_and_clear_bit(NFS_INO_INVALID_DATA)) {
		spin_unlock(inode->i_lock);
		goto out_raced;
	}
….
out_raced:
	clear_bit(NFS_INO_INVALIDATING)
	wake_up_bit(NFS_INO_INVALIDATING)


--
Trond Myklebust
Linux NFS client maintainer


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 7/7] nfs: page cache invalidation for dio
  2014-01-25  0:39                                 ` Trond Myklebust
@ 2014-01-25  0:54                                   ` Jeff Layton
  2014-01-25  1:05                                     ` Trond Myklebust
  0 siblings, 1 reply; 37+ messages in thread
From: Jeff Layton @ 2014-01-25  0:54 UTC (permalink / raw)
  To: Trond Myklebust; +Cc: Christoph Hellwig, linux-nfs

On Fri, 24 Jan 2014 17:39:45 -0700
Trond Myklebust <trond.myklebust@primarydata.com> wrote:

> 
> On Jan 24, 2014, at 14:21, Jeff Layton <jlayton@redhat.com> wrote:
> 
> > On Fri, 24 Jan 2014 11:46:41 -0700
> > Trond Myklebust <trond.myklebust@primarydata.com> wrote:
> >> 
> >> Convert your patch to use wait_on_bit(), and then to call
> >> wait_on_bit_lock() if and only if you see NFS_INO_INVALID_DATA is set.
> >> 
> > 
> > I think that too would be racy...
> > 
> > We have to clear NFS_INO_INVALID_DATA while holding the i_lock, but we
> > can't wait_on_bit_lock() under that. So (pseudocode):
> > 
> > wait_on_bit
> > take i_lock
> > check and clear NFS_INO_INVALID_DATA
> > drop i_lock
> > wait_on_bit_lock
> > 
> > ...so between dropping the i_lock and wait_on_bit_lock, we have a place
> > where another task could check the flag and find it clear.
> 
> 
> 	for(;;) {
> 		wait_on_bit(NFS_INO_INVALIDATING)
> 		/* Optimisation: don’t lock NFS_INO_INVALIDATING
> 		 * if NFS_INO_INVALID_DATA was cleared while we waited.
> 		 */
> 		if (!test_bit(NFS_INO_INVALID_DATA))
> 			return;
> 		if (!test_and_set_bit_lock(NFS_INO_INVALIDATING))
> 			break;
> 	}
> 	spin_lock(inode->i_lock);
> 	if (!test_and_clear_bit(NFS_INO_INVALID_DATA)) {
> 		spin_unlock(inode->i_lock);
> 		goto out_raced;
> 	}
> ….
> out_raced:
> 	clear_bit(NFS_INO_INVALIDATING)
> 	wake_up_bit(NFS_INO_INVALIDATING)
> 
> 
> --
> Trond Myklebust
> Linux NFS client maintainer
> 

Hmm maybe. OTOH, if we're using atomic bitops do we need to deal with
the spinlock?  I'll ponder it over the weekend and give it a harder
look on Monday.

Thanks for the thoughts so far...
-- 
Jeff Layton <jlayton@redhat.com>

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 7/7] nfs: page cache invalidation for dio
  2014-01-25  0:54                                   ` Jeff Layton
@ 2014-01-25  1:05                                     ` Trond Myklebust
  2014-01-25  1:11                                       ` Trond Myklebust
  0 siblings, 1 reply; 37+ messages in thread
From: Trond Myklebust @ 2014-01-25  1:05 UTC (permalink / raw)
  To: Layton Jeff; +Cc: Christoph Hellwig, linux-nfs


On Jan 24, 2014, at 17:54, Jeff Layton <jlayton@redhat.com> wrote:

> On Fri, 24 Jan 2014 17:39:45 -0700
> Trond Myklebust <trond.myklebust@primarydata.com> wrote:
> 
>> 
>> On Jan 24, 2014, at 14:21, Jeff Layton <jlayton@redhat.com> wrote:
>> 
>>> On Fri, 24 Jan 2014 11:46:41 -0700
>>> Trond Myklebust <trond.myklebust@primarydata.com> wrote:
>>>> 
>>>> Convert your patch to use wait_on_bit(), and then to call
>>>> wait_on_bit_lock() if and only if you see NFS_INO_INVALID_DATA is set.
>>>> 
>>> 
>>> I think that too would be racy...
>>> 
>>> We have to clear NFS_INO_INVALID_DATA while holding the i_lock, but we
>>> can't wait_on_bit_lock() under that. So (pseudocode):
>>> 
>>> wait_on_bit
>>> take i_lock
>>> check and clear NFS_INO_INVALID_DATA
>>> drop i_lock
>>> wait_on_bit_lock
>>> 
>>> ...so between dropping the i_lock and wait_on_bit_lock, we have a place
>>> where another task could check the flag and find it clear.
>> 
>> 
>> 	for(;;) {
>> 		wait_on_bit(NFS_INO_INVALIDATING)
>> 		/* Optimisation: don’t lock NFS_INO_INVALIDATING
>> 		 * if NFS_INO_INVALID_DATA was cleared while we waited.
>> 		 */
>> 		if (!test_bit(NFS_INO_INVALID_DATA))
>> 			return;
>> 		if (!test_and_set_bit_lock(NFS_INO_INVALIDATING))
>> 			break;
>> 	}
>> 	spin_lock(inode->i_lock);
>> 	if (!test_and_clear_bit(NFS_INO_INVALID_DATA)) {
>> 		spin_unlock(inode->i_lock);
>> 		goto out_raced;
>> 	}
>> ….
>> out_raced:
>> 	clear_bit(NFS_INO_INVALIDATING)
>> 	wake_up_bit(NFS_INO_INVALIDATING)
>> 
>> 
>> --
>> Trond Myklebust
>> Linux NFS client maintainer
>> 
> 
> Hmm maybe. OTOH, if we're using atomic bitops do we need to deal with
> the spinlock?  I'll ponder it over the weekend and give it a harder
> look on Monday.
> 

The NFS_I(inode)->cache_validity doesn’t use bitops, so the correct behaviour is to put NFS_INO_INVALIDATING inside NFS_I(inode)->flags (which is an atomic bit op field), and then continue to use the spin lock for NFS_INO_INVALID_DATA.

--
Trond Myklebust
Linux NFS client maintainer


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 7/7] nfs: page cache invalidation for dio
  2014-01-25  1:05                                     ` Trond Myklebust
@ 2014-01-25  1:11                                       ` Trond Myklebust
  0 siblings, 0 replies; 37+ messages in thread
From: Trond Myklebust @ 2014-01-25  1:11 UTC (permalink / raw)
  To: Layton Jeff; +Cc: Christoph Hellwig, linux-nfs


On Jan 24, 2014, at 18:05, Trond Myklebust <trond.myklebust@primarydata.com> wrote:

> 
> On Jan 24, 2014, at 17:54, Jeff Layton <jlayton@redhat.com> wrote:
> 
>> On Fri, 24 Jan 2014 17:39:45 -0700
>> Trond Myklebust <trond.myklebust@primarydata.com> wrote:
>> 
>>> 
>>> On Jan 24, 2014, at 14:21, Jeff Layton <jlayton@redhat.com> wrote:
>>> 
>>>> On Fri, 24 Jan 2014 11:46:41 -0700
>>>> Trond Myklebust <trond.myklebust@primarydata.com> wrote:
>>>>> 
>>>>> Convert your patch to use wait_on_bit(), and then to call
>>>>> wait_on_bit_lock() if and only if you see NFS_INO_INVALID_DATA is set.
>>>>> 
>>>> 
>>>> I think that too would be racy...
>>>> 
>>>> We have to clear NFS_INO_INVALID_DATA while holding the i_lock, but we
>>>> can't wait_on_bit_lock() under that. So (pseudocode):
>>>> 
>>>> wait_on_bit
>>>> take i_lock
>>>> check and clear NFS_INO_INVALID_DATA
>>>> drop i_lock
>>>> wait_on_bit_lock
>>>> 
>>>> ...so between dropping the i_lock and wait_on_bit_lock, we have a place
>>>> where another task could check the flag and find it clear.
>>> 
>>> 
>>> 	for(;;) {
>>> 		wait_on_bit(NFS_INO_INVALIDATING)
>>> 		/* Optimisation: don’t lock NFS_INO_INVALIDATING
>>> 		 * if NFS_INO_INVALID_DATA was cleared while we waited.
>>> 		 */
>>> 		if (!test_bit(NFS_INO_INVALID_DATA))
>>> 			return;
>>> 		if (!test_and_set_bit_lock(NFS_INO_INVALIDATING))
>>> 			break;
>>> 	}
>>> 	spin_lock(inode->i_lock);
>>> 	if (!test_and_clear_bit(NFS_INO_INVALID_DATA)) {
>>> 		spin_unlock(inode->i_lock);
>>> 		goto out_raced;
>>> 	}
>>> ….
>>> out_raced:
>>> 	clear_bit(NFS_INO_INVALIDATING)
>>> 	wake_up_bit(NFS_INO_INVALIDATING)
>>> 
>>> 
>>> --
>>> Trond Myklebust
>>> Linux NFS client maintainer
>>> 
>> 
>> Hmm maybe. OTOH, if we're using atomic bitops do we need to deal with
>> the spinlock?  I'll ponder it over the weekend and give it a harder
>> look on Monday.
>> 
> 
> The NFS_I(inode)->cache_validity doesn’t use bitops, so the correct behaviour is to put NFS_INO_INVALIDATING inside NFS_I(inode)->flags (which is an atomic bit op field), and then continue to use the spin lock for NFS_INO_INVALID_DATA.

In other words please replace the atomic test_bit(NFS_INO_INVALID_DATA) and test_and_clear_bit(NFS_INO_INVALID_DATA) in the above pseudocode with the appropriate tests and clears of NFS_I(inode)->cache_validity.

--
Trond Myklebust
Linux NFS client maintainer


^ permalink raw reply	[flat|nested] 37+ messages in thread

end of thread, other threads:[~2014-01-25  1:11 UTC | newest]

Thread overview: 37+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-11-14 16:50 [PATCH 0/7] direct I/O fixes Christoph Hellwig
2013-11-14 16:50 ` [PATCH 1/7] nfs: fix size updates for aio writes Christoph Hellwig
2013-11-14 16:50 ` [PATCH 2/7] nfs: defer inode_dio_done call until size update is done Christoph Hellwig
2013-11-14 16:50 ` [PATCH 3/7] nfs: increment i_dio_count for reads, too Christoph Hellwig
2013-11-14 16:50 ` [PATCH 4/7] nfs: merge nfs_direct_read into nfs_file_direct_read Christoph Hellwig
2013-11-14 16:50 ` [PATCH 5/7] nfs: merge nfs_direct_write into nfs_file_direct_write Christoph Hellwig
2013-11-14 16:50 ` [PATCH 6/7] nfs: take i_mutex during direct I/O reads Christoph Hellwig
2013-11-14 17:00   ` Chuck Lever
2013-11-15 14:29     ` Christoph Hellwig
2013-11-14 20:43   ` Trond Myklebust
2013-11-15 14:32     ` Christoph Hellwig
2013-11-15 15:23       ` Trond Myklebust
2013-11-15 15:25         ` Christoph Hellwig
2013-11-15 15:34           ` Trond Myklebust
2013-11-15 15:37             ` Christoph Hellwig
2013-11-15 16:00           ` Trond Myklebust
2013-11-14 16:50 ` [PATCH 7/7] nfs: page cache invalidation for dio Christoph Hellwig
2013-11-14 18:35   ` Jeff Layton
2013-11-15 14:28     ` Christoph Hellwig
2013-11-15 14:52       ` Jeff Layton
2013-11-15 15:02         ` Christoph Hellwig
2013-11-15 15:33           ` Jeff Layton
2014-01-21 19:21             ` Jeff Layton
2014-01-22  8:24               ` Christoph Hellwig
2014-01-22 12:04                 ` Jeff Layton
2014-01-24 15:50                   ` Jeff Layton
2014-01-24 15:52                   ` Jeff Layton
2014-01-24 17:11                     ` Trond Myklebust
2014-01-24 17:29                       ` Jeff Layton
2014-01-24 17:40                         ` Trond Myklebust
2014-01-24 18:00                           ` Jeff Layton
2014-01-24 18:46                             ` Trond Myklebust
2014-01-24 21:21                               ` Jeff Layton
2014-01-25  0:39                                 ` Trond Myklebust
2014-01-25  0:54                                   ` Jeff Layton
2014-01-25  1:05                                     ` Trond Myklebust
2014-01-25  1:11                                       ` Trond Myklebust

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).