From: Trond Myklebust <trond.myklebust@primarydata.com>
To: linux-nfs@vger.kernel.org
Subject: [PATCH v3 10/13] NFS: Do not serialise O_DIRECT reads and writes
Date: Wed, 22 Jun 2016 16:47:45 -0400 [thread overview]
Message-ID: <1466628468-3958-10-git-send-email-trond.myklebust@primarydata.com> (raw)
In-Reply-To: <1466628468-3958-9-git-send-email-trond.myklebust@primarydata.com>
Allow dio requests to be scheduled in parallel, but ensuring that they
do not conflict with buffered I/O.
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
fs/nfs/Makefile | 2 +-
fs/nfs/direct.c | 12 ++---
fs/nfs/file.c | 39 ++++++++++-----
fs/nfs/internal.h | 8 ++++
fs/nfs/io.c | 128 +++++++++++++++++++++++++++++++++++++++++++++++++
include/linux/nfs_fs.h | 1 +
6 files changed, 170 insertions(+), 20 deletions(-)
create mode 100644 fs/nfs/io.c
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 8664417955a2..6abdda209642 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -6,7 +6,7 @@ obj-$(CONFIG_NFS_FS) += nfs.o
CFLAGS_nfstrace.o += -I$(src)
nfs-y := client.o dir.o file.o getroot.o inode.o super.o \
- direct.o pagelist.o read.o symlink.o unlink.o \
+ io.o direct.o pagelist.o read.o symlink.o unlink.o \
write.o namespace.o mount_clnt.o nfstrace.o
nfs-$(CONFIG_ROOT_NFS) += nfsroot.o
nfs-$(CONFIG_SYSCTL) += sysctl.o
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index fb659bb50678..2333e9973802 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -587,7 +587,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
if (!count)
goto out;
- inode_lock(inode);
+ nfs_start_io_direct(inode);
result = nfs_sync_mapping(mapping);
if (result)
goto out_unlock;
@@ -615,7 +615,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
NFS_I(inode)->read_io += count;
result = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos);
- inode_unlock(inode);
+ nfs_end_io_direct(inode);
if (!result) {
result = nfs_direct_wait(dreq);
@@ -629,7 +629,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
out_release:
nfs_direct_req_release(dreq);
out_unlock:
- inode_unlock(inode);
+ nfs_end_io_direct(inode);
out:
return result;
}
@@ -1013,7 +1013,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
pos = iocb->ki_pos;
end = (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT;
- inode_lock(inode);
+ nfs_start_io_direct(inode);
result = nfs_sync_mapping(mapping);
if (result)
@@ -1053,7 +1053,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
pos >> PAGE_SHIFT, end);
}
- inode_unlock(inode);
+ nfs_end_io_direct(inode);
if (!result) {
result = nfs_direct_wait(dreq);
@@ -1076,7 +1076,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
out_release:
nfs_direct_req_release(dreq);
out_unlock:
- inode_unlock(inode);
+ nfs_end_io_direct(inode);
return result;
}
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index df4dd8e7e62e..b5772e5a5961 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -170,12 +170,14 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to)
iocb->ki_filp,
iov_iter_count(to), (unsigned long) iocb->ki_pos);
- result = nfs_revalidate_mapping_protected(inode, iocb->ki_filp->f_mapping);
+ nfs_start_io_read(inode);
+ result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
if (!result) {
result = generic_file_read_iter(iocb, to);
if (result > 0)
nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result);
}
+ nfs_end_io_read(inode);
return result;
}
EXPORT_SYMBOL_GPL(nfs_file_read);
@@ -191,12 +193,14 @@ nfs_file_splice_read(struct file *filp, loff_t *ppos,
dprintk("NFS: splice_read(%pD2, %lu@%Lu)\n",
filp, (unsigned long) count, (unsigned long long) *ppos);
- res = nfs_revalidate_mapping_protected(inode, filp->f_mapping);
+ nfs_start_io_read(inode);
+ res = nfs_revalidate_mapping(inode, filp->f_mapping);
if (!res) {
res = generic_file_splice_read(filp, ppos, pipe, count, flags);
if (res > 0)
nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, res);
}
+ nfs_end_io_read(inode);
return res;
}
EXPORT_SYMBOL_GPL(nfs_file_splice_read);
@@ -639,40 +643,49 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
dprintk("NFS: write(%pD2, %zu@%Ld)\n",
file, count, (long long) iocb->ki_pos);
- result = -EBUSY;
if (IS_SWAPFILE(inode))
goto out_swapfile;
+
+ nfs_start_io_write(inode);
/*
* O_APPEND implies that we must revalidate the file length.
*/
if (iocb->ki_flags & IOCB_APPEND) {
result = nfs_revalidate_file_size(inode, file);
if (result)
- goto out;
+ goto out_unlock;
}
- result = count;
- if (!count)
+ result = generic_write_checks(iocb, from);
+ if (result <= 0)
+ goto out_unlock;
+
+ current->backing_dev_info = inode_to_bdi(inode);
+ result = generic_perform_write(file, from, iocb->ki_pos);
+ current->backing_dev_info = NULL;
+ nfs_end_io_write(inode);
+ if (result <= 0)
goto out;
- result = generic_file_write_iter(iocb, from);
- if (result > 0)
- written = result;
+ written = generic_write_sync(iocb, result);
+ iocb->ki_pos += written;
/* Return error values */
- if (result >= 0 && nfs_need_check_write(file, inode)) {
+ if (nfs_need_check_write(file, inode)) {
int err = vfs_fsync(file, 0);
if (err < 0)
result = err;
}
- if (result > 0)
- nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written);
+ nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written);
out:
return result;
+out_unlock:
+ nfs_end_io_write(inode);
+ return result;
out_swapfile:
printk(KERN_INFO "NFS: attempt to write to active swap file!\n");
- goto out;
+ return -EBUSY;
}
EXPORT_SYMBOL_GPL(nfs_file_write);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 0eb5c924886d..159b64ede82a 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -411,6 +411,14 @@ extern void __exit unregister_nfs_fs(void);
extern bool nfs_sb_active(struct super_block *sb);
extern void nfs_sb_deactive(struct super_block *sb);
+/* io.c */
+extern void nfs_start_io_read(struct inode *inode);
+extern void nfs_end_io_read(struct inode *inode);
+extern void nfs_start_io_write(struct inode *inode);
+extern void nfs_end_io_write(struct inode *inode);
+extern void nfs_start_io_direct(struct inode *inode);
+extern void nfs_end_io_direct(struct inode *inode);
+
/* namespace.c */
#define NFS_PATH_CANONICAL 1
extern char *nfs_path(char **p, struct dentry *dentry,
diff --git a/fs/nfs/io.c b/fs/nfs/io.c
new file mode 100644
index 000000000000..013ccb67c106
--- /dev/null
+++ b/fs/nfs/io.c
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2016 Trond Myklebust
+ *
+ * I/O and data path helper functionality.
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/bitops.h>
+#include <linux/rwsem.h>
+#include <linux/fs.h>
+#include <linux/nfs_fs.h>
+
+#include "internal.h"
+
+/**
+ * nfs_start_io_read - declare the file is being used for buffered reads
+ * @inode - file inode
+ *
+ * Declare that a buffered read operation is about to start, and ensure
+ * that we block all direct I/O.
+ * On exit, the function ensures that the NFS_INO_ODIRECT flag is unset,
+ * and holds a shared lock on inode->i_rwsem to ensure that the flag
+ * cannot be changed.
+ * In practice, this means that buffered read operations are allowed to
+ * execute in parallel, thanks to the shared lock, whereas direct I/O
+ * operations need to wait to grab an exclusive lock in order to set
+ * NFS_INO_ODIRECT.
+ * Note that buffered writes and truncates both take a write lock on
+ * inode->i_rwsem, meaning that those are serialised w.r.t. the reads.
+ */
+void
+nfs_start_io_read(struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ /* Be an optimist! */
+ down_read(&inode->i_rwsem);
+ if (test_bit(NFS_INO_ODIRECT, &nfsi->flags) == 0)
+ return;
+ up_read(&inode->i_rwsem);
+ /* Slow path.... */
+ down_write(&inode->i_rwsem);
+ clear_bit(NFS_INO_ODIRECT, &nfsi->flags);
+ downgrade_write(&inode->i_rwsem);
+}
+
+/**
+ * nfs_end_io_read - declare that the buffered read operation is done
+ * @inode - file inode
+ *
+ * Declare that a buffered read operation is done, and release the shared
+ * lock on inode->i_rwsem.
+ */
+void
+nfs_end_io_read(struct inode *inode)
+{
+ up_read(&inode->i_rwsem);
+}
+
+/**
+ * nfs_start_io_write - declare the file is being used for buffered writes
+ * @inode - file inode
+ *
+ * Declare that a buffered read operation is about to start, and ensure
+ * that we block all direct I/O.
+ */
+void
+nfs_start_io_write(struct inode *inode)
+{
+ down_write(&inode->i_rwsem);
+}
+
+/**
+ * nfs_end_io_write - declare that the buffered write operation is done
+ * @inode - file inode
+ *
+ * Declare that a buffered write operation is done, and release the
+ * lock on inode->i_rwsem.
+ */
+void
+nfs_end_io_write(struct inode *inode)
+{
+ up_write(&inode->i_rwsem);
+}
+
+/**
+ * nfs_end_io_direct - declare the file is being used for direct i/o
+ * @inode - file inode
+ *
+ * Declare that a direct I/O operation is about to start, and ensure
+ * that we block all buffered I/O.
+ * On exit, the function ensures that the NFS_INO_ODIRECT flag is set,
+ * and holds a shared lock on inode->i_rwsem to ensure that the flag
+ * cannot be changed.
+ * In practice, this means that direct I/O operations are allowed to
+ * execute in parallel, thanks to the shared lock, whereas buffered I/O
+ * operations need to wait to grab an exclusive lock in order to clear
+ * NFS_INO_ODIRECT.
+ * Note that buffered writes and truncates both take a write lock on
+ * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT.
+ */
+void
+nfs_start_io_direct(struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ /* Be an optimist! */
+ down_read(&inode->i_rwsem);
+ if (test_bit(NFS_INO_ODIRECT, &nfsi->flags) != 0)
+ return;
+ up_read(&inode->i_rwsem);
+ /* Slow path.... */
+ down_write(&inode->i_rwsem);
+ set_bit(NFS_INO_ODIRECT, &nfsi->flags);
+ downgrade_write(&inode->i_rwsem);
+}
+
+/**
+ * nfs_end_io_direct - declare that the direct i/o operation is done
+ * @inode - file inode
+ *
+ * Declare that a direct I/O operation is done, and release the shared
+ * lock on inode->i_rwsem.
+ */
+void
+nfs_end_io_direct(struct inode *inode)
+{
+ up_read(&inode->i_rwsem);
+}
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 120dd04b553c..225d17d35277 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -210,6 +210,7 @@ struct nfs_inode {
#define NFS_INO_LAYOUTCOMMIT (9) /* layoutcommit required */
#define NFS_INO_LAYOUTCOMMITTING (10) /* layoutcommit inflight */
#define NFS_INO_LAYOUTSTATS (11) /* layoutstats inflight */
+#define NFS_INO_ODIRECT (12) /* I/O setting is O_DIRECT */
static inline struct nfs_inode *NFS_I(const struct inode *inode)
{
--
2.7.4
next prev parent reply other threads:[~2016-06-22 20:48 UTC|newest]
Thread overview: 13+ messages / expand[flat|nested] mbox.gz Atom feed top
2016-06-22 20:47 [PATCH v3 01/13] NFS: Don't flush caches for a getattr that races with writeback Trond Myklebust
2016-06-22 20:47 ` [PATCH v3 02/13] NFS: Cache access checks more aggressively Trond Myklebust
2016-06-22 20:47 ` [PATCH v3 03/13] NFS: Cache aggressively when file is open for writing Trond Myklebust
2016-06-22 20:47 ` [PATCH v3 04/13] NFS: Kill NFS_INO_NFS_INO_FLUSHING: it is a performance killer Trond Myklebust
2016-06-22 20:47 ` [PATCH v3 05/13] NFS: writepage of a single page should not be synchronous Trond Myklebust
2016-06-22 20:47 ` [PATCH v3 06/13] NFS: Don't hold the inode lock across fsync() Trond Myklebust
2016-06-22 20:47 ` [PATCH v3 07/13] NFS: Don't call COMMIT in ->releasepage() Trond Myklebust
2016-06-22 20:47 ` [PATCH v3 08/13] NFS: Fix O_DIRECT verifier problems Trond Myklebust
2016-06-22 20:47 ` [PATCH v3 09/13] NFS: Ensure we reset the write verifier 'committed' value on resend Trond Myklebust
2016-06-22 20:47 ` Trond Myklebust [this message]
2016-06-22 20:47 ` [PATCH v3 11/13] NFS: Remove racy size manipulations in O_DIRECT Trond Myklebust
2016-06-22 20:47 ` [PATCH v3 12/13] NFS: Remove inode->i_dio_count from the NFS O_DIRECT code Trond Myklebust
2016-06-22 20:47 ` [PATCH v3 13/13] NFS: Remove unused function nfs_revalidate_mapping_protected() Trond Myklebust
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1466628468-3958-10-git-send-email-trond.myklebust@primarydata.com \
--to=trond.myklebust@primarydata.com \
--cc=linux-nfs@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).