linux-nfs.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC PATCH 22/22] nfs: add support for read_iter, write_iter
       [not found] <1330377576-3659-1-git-send-email-dave.kleikamp@oracle.com>
@ 2012-02-27 21:19 ` Dave Kleikamp
  2012-02-27 22:08   ` Myklebust, Trond
  0 siblings, 1 reply; 3+ messages in thread
From: Dave Kleikamp @ 2012-02-27 21:19 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: linux-kernel, Zach Brown, Dave Kleikamp, Trond Myklebust,
	linux-nfs

This patch implements the read_iter and write_iter file operations which
allow kernel code to initiate directIO. This allows the loop device to
read and write directly to the server, bypassing the page cache.

Signed-off-by: Dave Kleikamp <dave.kleikamp@oracle.com>
Cc: Zach Brown <zab@zabbo.net>
Cc: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: linux-nfs@vger.kernel.org
---
 fs/nfs/direct.c        |  508 +++++++++++++++++++++++++++++++++++++++---------
 fs/nfs/file.c          |   80 ++++++++
 include/linux/nfs_fs.h |    4 +
 3 files changed, 497 insertions(+), 95 deletions(-)

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 1940f1a..fc2c5c3 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -46,6 +46,7 @@
 #include <linux/kref.h>
 #include <linux/slab.h>
 #include <linux/task_io_accounting_ops.h>
+#include <linux/bio.h>
 
 #include <linux/nfs_fs.h>
 #include <linux/nfs_page.h>
@@ -87,6 +88,7 @@ struct nfs_direct_req {
 	int			flags;
 #define NFS_ODIRECT_DO_COMMIT		(1)	/* an unstable reply was received */
 #define NFS_ODIRECT_RESCHED_WRITES	(2)	/* write verification failed */
+#define NFS_ODIRECT_MARK_DIRTY		(4)	/* mark read pages dirty */
 	struct nfs_writeverf	verf;		/* unstable write verifier */
 };
 
@@ -253,9 +255,10 @@ static void nfs_direct_read_release(void *calldata)
 	} else {
 		dreq->count += data->res.count;
 		spin_unlock(&dreq->lock);
-		nfs_direct_dirty_pages(data->pagevec,
-				data->args.pgbase,
-				data->res.count);
+		if (dreq->flags & NFS_ODIRECT_MARK_DIRTY)
+			nfs_direct_dirty_pages(data->pagevec,
+					       data->args.pgbase,
+					       data->res.count);
 	}
 	nfs_direct_release_pages(data->pagevec, data->npages);
 
@@ -273,21 +276,15 @@ static const struct rpc_call_ops nfs_read_direct_ops = {
 };
 
 /*
- * For each rsize'd chunk of the user's buffer, dispatch an NFS READ
- * operation.  If nfs_readdata_alloc() or get_user_pages() fails,
- * bail and stop sending more reads.  Read length accounting is
- * handled automatically by nfs_direct_read_result().  Otherwise, if
- * no requests have been sent, just return an error.
+ * upon entry, data->pagevec contains pinned pages
  */
-static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
-						const struct iovec *iov,
-						loff_t pos)
+static ssize_t nfs_direct_read_schedule_helper(struct nfs_direct_req *dreq,
+					       struct nfs_read_data *data,
+					       size_t addr, size_t count,
+					       loff_t pos)
 {
 	struct nfs_open_context *ctx = dreq->ctx;
 	struct inode *inode = ctx->dentry->d_inode;
-	unsigned long user_addr = (unsigned long)iov->iov_base;
-	size_t count = iov->iov_len;
-	size_t rsize = NFS_SERVER(inode)->rsize;
 	struct rpc_task *task;
 	struct rpc_message msg = {
 		.rpc_cred = ctx->cred,
@@ -299,6 +296,61 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
 		.workqueue = nfsiod_workqueue,
 		.flags = RPC_TASK_ASYNC,
 	};
+	unsigned int pgbase = addr & ~PAGE_MASK;
+
+	get_dreq(dreq);
+
+	data->req = (struct nfs_page *) dreq;
+	data->inode = inode;
+	data->cred = msg.rpc_cred;
+	data->args.fh = NFS_FH(inode);
+	data->args.context = ctx;
+	data->args.lock_context = dreq->l_ctx;
+	data->args.offset = pos;
+	data->args.pgbase = pgbase;
+	data->args.pages = data->pagevec;
+	data->args.count = count;
+	data->res.fattr = &data->fattr;
+	data->res.eof = 0;
+	data->res.count = count;
+	nfs_fattr_init(&data->fattr);
+	msg.rpc_argp = &data->args;
+	msg.rpc_resp = &data->res;
+
+	task_setup_data.task = &data->task;
+	task_setup_data.callback_data = data;
+	NFS_PROTO(inode)->read_setup(data, &msg);
+
+	task = rpc_run_task(&task_setup_data);
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+	rpc_put_task(task);
+
+	dprintk("NFS: %5u initiated direct read call "
+		"(req %s/%Ld, %zu bytes @ offset %Lu)\n",
+		data->task.tk_pid, inode->i_sb->s_id,
+		(long long)NFS_FILEID(inode), count,
+		(unsigned long long)data->args.offset);
+
+	return count;
+}
+
+/*
+ * For each rsize'd chunk of the user's buffer, dispatch an NFS READ
+ * operation.  If nfs_readdata_alloc() or get_user_pages() fails,
+ * bail and stop sending more reads.  Read length accounting is
+ * handled automatically by nfs_direct_read_result().  Otherwise, if
+ * no requests have been sent, just return an error.
+ */
+static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
+						const struct iovec *iov,
+						loff_t pos)
+{
+	struct nfs_open_context *ctx = dreq->ctx;
+	struct inode *inode = ctx->dentry->d_inode;
+	unsigned long user_addr = (unsigned long)iov->iov_base;
+	size_t count = iov->iov_len;
+	size_t rsize = NFS_SERVER(inode)->rsize;
 	unsigned int pgbase;
 	int result;
 	ssize_t started = 0;
@@ -334,41 +386,10 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
 			data->npages = result;
 		}
 
-		get_dreq(dreq);
-
-		data->req = (struct nfs_page *) dreq;
-		data->inode = inode;
-		data->cred = msg.rpc_cred;
-		data->args.fh = NFS_FH(inode);
-		data->args.context = ctx;
-		data->args.lock_context = dreq->l_ctx;
-		data->args.offset = pos;
-		data->args.pgbase = pgbase;
-		data->args.pages = data->pagevec;
-		data->args.count = bytes;
-		data->res.fattr = &data->fattr;
-		data->res.eof = 0;
-		data->res.count = bytes;
-		nfs_fattr_init(&data->fattr);
-		msg.rpc_argp = &data->args;
-		msg.rpc_resp = &data->res;
-
-		task_setup_data.task = &data->task;
-		task_setup_data.callback_data = data;
-		NFS_PROTO(inode)->read_setup(data, &msg);
-
-		task = rpc_run_task(&task_setup_data);
-		if (IS_ERR(task))
+		bytes = nfs_direct_read_schedule_helper(dreq, data, user_addr,
+							 bytes, pos);
+		if (bytes < 0)
 			break;
-		rpc_put_task(task);
-
-		dprintk("NFS: %5u initiated direct read call "
-			"(req %s/%Ld, %zu bytes @ offset %Lu)\n",
-				data->task.tk_pid,
-				inode->i_sb->s_id,
-				(long long)NFS_FILEID(inode),
-				bytes,
-				(unsigned long long)data->args.offset);
 
 		started += bytes;
 		user_addr += bytes;
@@ -440,6 +461,7 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
 		goto out_release;
 	if (!is_sync_kiocb(iocb))
 		dreq->iocb = iocb;
+	dreq->flags = NFS_ODIRECT_MARK_DIRTY;
 
 	result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos);
 	if (!result)
@@ -450,6 +472,118 @@ out:
 	return result;
 }
 
+/*
+ * Dispatch NFS READs for an array of bio_vec segments (kernel pages).
+ *
+ * Each segment is cut into rsize'd chunks and every chunk becomes one
+ * RPC.  A fresh nfs_read_data and a fresh page reference are taken per
+ * RPC: the rpc_task is embedded in the nfs_read_data, and
+ * nfs_direct_read_release() releases data->pagevec once per call, so a
+ * single nfs_read_data must never back more than one call.
+ *
+ * Returns 0 if at least one chunk was requested; completion is then
+ * reported through the dreq.  Returns a negative errno if nothing was
+ * started.
+ */
+static ssize_t nfs_direct_read_schedule_bvec(struct nfs_direct_req *dreq,
+					     struct bio_vec *bvec,
+					     unsigned long nr_segs,
+					     loff_t pos)
+{
+	struct nfs_open_context *ctx = dreq->ctx;
+	struct inode *inode = ctx->dentry->d_inode;
+	size_t rsize = NFS_SERVER(inode)->rsize;
+	struct nfs_read_data *data;
+	ssize_t result = 0;
+	size_t requested_bytes = 0;
+	unsigned long seg;	/* same type as nr_segs */
+	size_t addr;
+	size_t count;
+
+	/* extra reference while queueing; dropped by put_dreq() below */
+	get_dreq(dreq);
+
+	for (seg = 0; seg < nr_segs; seg++) {
+		addr = bvec[seg].bv_offset;
+		count = bvec[seg].bv_len;
+		do {
+			size_t bytes = min(rsize, count);
+
+			/* one nfs_read_data and one page ref per RPC */
+			data = nfs_readdata_alloc(1);
+			if (unlikely(!data)) {
+				result = -ENOMEM;
+				goto out;
+			}
+			page_cache_get(bvec[seg].bv_page);
+			data->pagevec[0] = bvec[seg].bv_page;
+
+			result = nfs_direct_read_schedule_helper(dreq, data,
+								 addr, bytes,
+								 pos);
+			if (result < 0)
+				goto out;
+
+			requested_bytes += bytes;
+			addr += bytes;
+			pos += bytes;
+			count -= bytes;
+		} while (count);
+	}
+out:
+	/*
+	 * If no bytes were started, return the error, and let the
+	 * generic layer handle the completion.
+	 */
+	if (requested_bytes == 0) {
+		nfs_direct_req_release(dreq);
+		return result < 0 ? result : -EIO;
+	}
+
+	if (put_dreq(dreq))
+		nfs_direct_complete(dreq);
+	return 0;
+}
+
+/*
+ * Set up a direct read request for a bio_vec array and wait for it.
+ *
+ * NFS_ODIRECT_MARK_DIRTY is not set on this path (nfs_direct_read()
+ * sets it for iovec reads), so nfs_direct_read_release() skips
+ * nfs_direct_dirty_pages() -- assuming nfs_direct_req_alloc() returns
+ * a dreq with flags cleared; TODO confirm.
+ *
+ * Returns the nfs_direct_wait() result when scheduling succeeded,
+ * otherwise a negative errno.
+ */
+static ssize_t nfs_direct_read_bvec(struct kiocb *iocb, struct bio_vec *bvec,
+				    unsigned long nr_segs, loff_t pos)
+{
+	ssize_t result = -ENOMEM;
+	struct inode *inode = iocb->ki_filp->f_mapping->host;
+	struct nfs_direct_req *dreq;
+
+	dreq = nfs_direct_req_alloc();
+	if (dreq == NULL)
+		goto out;
+
+	dreq->inode = inode;
+	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
+	dreq->l_ctx = nfs_get_lock_context(dreq->ctx);
+	if (dreq->l_ctx == NULL)
+		goto out_release;
+	if (!is_sync_kiocb(iocb))
+		dreq->iocb = iocb;
+
+	result = nfs_direct_read_schedule_bvec(dreq, bvec, nr_segs, pos);
+	if (!result)
+		result = nfs_direct_wait(dreq);
+out_release:
+	nfs_direct_req_release(dreq);
+out:
+	return result;
+}
+
 static void nfs_direct_free_writedata(struct nfs_direct_req *dreq)
 {
 	while (!list_empty(&dreq->rewrite_list)) {
@@ -704,20 +810,15 @@ static const struct rpc_call_ops nfs_write_direct_ops = {
 };
 
 /*
- * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE
- * operation.  If nfs_writedata_alloc() or get_user_pages() fails,
- * bail and stop sending more writes.  Write length accounting is
- * handled automatically by nfs_direct_write_result().  Otherwise, if
- * no requests have been sent, just return an error.
+ * upon entry, data->pagevec contains pinned pages
  */
-static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
-						 const struct iovec *iov,
-						 loff_t pos, int sync)
+static ssize_t nfs_direct_write_schedule_helper(struct nfs_direct_req *dreq,
+						struct nfs_write_data *data,
+						size_t addr, size_t count,
+						loff_t pos, int sync)
 {
 	struct nfs_open_context *ctx = dreq->ctx;
 	struct inode *inode = ctx->dentry->d_inode;
-	unsigned long user_addr = (unsigned long)iov->iov_base;
-	size_t count = iov->iov_len;
 	struct rpc_task *task;
 	struct rpc_message msg = {
 		.rpc_cred = ctx->cred,
@@ -729,6 +830,63 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
 		.workqueue = nfsiod_workqueue,
 		.flags = RPC_TASK_ASYNC,
 	};
+	unsigned int pgbase = addr & ~PAGE_MASK;
+
+	get_dreq(dreq);
+
+	list_move_tail(&data->pages, &dreq->rewrite_list);
+
+	data->req = (struct nfs_page *) dreq;
+	data->inode = inode;
+	data->cred = msg.rpc_cred;
+	data->args.fh = NFS_FH(inode);
+	data->args.context = ctx;
+	data->args.lock_context = dreq->l_ctx;
+	data->args.offset = pos;
+	data->args.pgbase = pgbase;
+	data->args.pages = data->pagevec;
+	data->args.count = count;
+	data->args.stable = sync;
+	data->res.fattr = &data->fattr;
+	data->res.count = count;
+	data->res.verf = &data->verf;
+	nfs_fattr_init(&data->fattr);
+
+	task_setup_data.task = &data->task;
+	task_setup_data.callback_data = data;
+	msg.rpc_argp = &data->args;
+	msg.rpc_resp = &data->res;
+	NFS_PROTO(inode)->write_setup(data, &msg);
+
+	task = rpc_run_task(&task_setup_data);
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+	rpc_put_task(task);
+
+	dprintk("NFS: %5u initiated direct write call "
+		"(req %s/%Ld, %zu bytes @ offset %Lu)\n",
+		data->task.tk_pid, inode->i_sb->s_id,
+		(long long)NFS_FILEID(inode), count,
+		(unsigned long long)data->args.offset);
+
+	return count;
+}
+
+/*
+ * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE
+ * operation.  If nfs_writedata_alloc() or get_user_pages() fails,
+ * bail and stop sending more writes.  Write length accounting is
+ * handled automatically by nfs_direct_write_result().  Otherwise, if
+ * no requests have been sent, just return an error.
+ */
+static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
+						 const struct iovec *iov,
+						 loff_t pos, int sync)
+{
+	struct nfs_open_context *ctx = dreq->ctx;
+	struct inode *inode = ctx->dentry->d_inode;
+	unsigned long user_addr = (unsigned long)iov->iov_base;
+	size_t count = iov->iov_len;
 	size_t wsize = NFS_SERVER(inode)->wsize;
 	unsigned int pgbase;
 	int result;
@@ -765,44 +923,10 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
 			data->npages = result;
 		}
 
-		get_dreq(dreq);
-
-		list_move_tail(&data->pages, &dreq->rewrite_list);
-
-		data->req = (struct nfs_page *) dreq;
-		data->inode = inode;
-		data->cred = msg.rpc_cred;
-		data->args.fh = NFS_FH(inode);
-		data->args.context = ctx;
-		data->args.lock_context = dreq->l_ctx;
-		data->args.offset = pos;
-		data->args.pgbase = pgbase;
-		data->args.pages = data->pagevec;
-		data->args.count = bytes;
-		data->args.stable = sync;
-		data->res.fattr = &data->fattr;
-		data->res.count = bytes;
-		data->res.verf = &data->verf;
-		nfs_fattr_init(&data->fattr);
-
-		task_setup_data.task = &data->task;
-		task_setup_data.callback_data = data;
-		msg.rpc_argp = &data->args;
-		msg.rpc_resp = &data->res;
-		NFS_PROTO(inode)->write_setup(data, &msg);
-
-		task = rpc_run_task(&task_setup_data);
-		if (IS_ERR(task))
+		result = nfs_direct_write_schedule_helper(dreq, data, user_addr,
+							  bytes, pos, sync);
+		if (result < 0)
 			break;
-		rpc_put_task(task);
-
-		dprintk("NFS: %5u initiated direct write call "
-			"(req %s/%Ld, %zu bytes @ offset %Lu)\n",
-				data->task.tk_pid,
-				inode->i_sb->s_id,
-				(long long)NFS_FILEID(inode),
-				bytes,
-				(unsigned long long)data->args.offset);
 
 		started += bytes;
 		user_addr += bytes;
@@ -858,6 +982,124 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
 	return 0;
 }
 
+/*
+ * Dispatch NFS WRITEs for an array of bio_vec segments (kernel pages).
+ *
+ * Each segment is cut into wsize'd chunks and every chunk becomes one
+ * RPC.  A fresh nfs_write_data and a fresh page reference are taken per
+ * RPC: the rpc_task is embedded in the nfs_write_data and the helper
+ * queues it on dreq->rewrite_list, so a single nfs_write_data must
+ * never back more than one call.
+ *
+ * Returns 0 if at least one chunk was requested; completion is then
+ * reported through the dreq.  Returns a negative errno if nothing was
+ * started.
+ */
+static ssize_t nfs_direct_write_schedule_bvec(struct nfs_direct_req *dreq,
+					      struct bio_vec *bvec,
+					      size_t nr_segs, loff_t pos,
+					      int sync)
+{
+	struct nfs_open_context *ctx = dreq->ctx;
+	struct inode *inode = ctx->dentry->d_inode;
+	size_t wsize = NFS_SERVER(inode)->wsize;
+	struct nfs_write_data *data;
+	ssize_t result = 0;
+	size_t requested_bytes = 0;
+	unsigned long seg;
+	size_t addr;
+	size_t count;
+
+	/* extra reference while queueing; dropped by put_dreq() below */
+	get_dreq(dreq);
+
+	for (seg = 0; seg < nr_segs; seg++) {
+		addr = bvec[seg].bv_offset;
+		count = bvec[seg].bv_len;
+		do {
+			size_t bytes = min(wsize, count);
+
+			/* one nfs_write_data and one page ref per RPC */
+			data = nfs_writedata_alloc(1);
+			if (unlikely(!data)) {
+				result = -ENOMEM;
+				goto out;
+			}
+			page_cache_get(bvec[seg].bv_page);
+			data->pagevec[0] = bvec[seg].bv_page;
+
+			result = nfs_direct_write_schedule_helper(dreq, data,
+								  addr, bytes,
+								  pos, sync);
+			if (result < 0)
+				goto out;
+
+			requested_bytes += bytes;
+			addr += bytes;
+			pos += bytes;
+			count -= bytes;
+		} while (count);
+	}
+out:
+	/*
+	 * If no bytes were started, return the error, and let the
+	 * generic layer handle the completion.
+	 */
+	if (requested_bytes == 0) {
+		nfs_direct_req_release(dreq);
+		return result < 0 ? result : -EIO;
+	}
+
+	if (put_dreq(dreq))
+		nfs_direct_write_complete(dreq, dreq->inode);
+	return 0;
+}
+
+/*
+ * Set up a direct write request for a bio_vec array and wait for it.
+ *
+ * WRITEs are sent NFS_FILE_SYNC when no commit data could be allocated
+ * or when count does not exceed wsize; otherwise they are sent
+ * NFS_UNSTABLE.
+ *
+ * Returns the nfs_direct_wait() result when scheduling succeeded,
+ * otherwise a negative errno.
+ */
+static ssize_t nfs_direct_write_bvec(struct kiocb *iocb, struct bio_vec *bvec,
+				     unsigned long nr_segs, loff_t pos,
+				     size_t count)
+{
+	ssize_t result = -ENOMEM;
+	struct inode *inode = iocb->ki_filp->f_mapping->host;
+	struct nfs_direct_req *dreq;
+	size_t wsize = NFS_SERVER(inode)->wsize;
+	int sync = NFS_UNSTABLE;
+
+	dreq = nfs_direct_req_alloc();
+	if (!dreq)
+		goto out;
+	nfs_alloc_commit_data(dreq);
+
+	if (dreq->commit_data == NULL || count <= wsize)
+		sync = NFS_FILE_SYNC;
+
+	dreq->inode = inode;
+	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
+	dreq->l_ctx = nfs_get_lock_context(dreq->ctx);
+	if (dreq->l_ctx == NULL)
+		goto out_release;
+	if (!is_sync_kiocb(iocb))
+		dreq->iocb = iocb;
+
+	result = nfs_direct_write_schedule_bvec(dreq, bvec, nr_segs, pos, sync);
+	if (!result)
+		result = nfs_direct_wait(dreq);
+out_release:
+	nfs_direct_req_release(dreq);
+out:
+	return result;
+}
+
 static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
 				unsigned long nr_segs, loff_t pos,
 				size_t count)
@@ -948,6 +1164,75 @@ out:
 	return retval;
 }
 
+/*
+ * nfs_file_direct_read_bvec - direct read into an array of bio_vecs
+ * @iocb: target I/O control block
+ * @bvec: array of page/offset/length segments to fill
+ * @nr_segs: number of entries in @bvec
+ * @pos: file offset to start reading at
+ *
+ * Counterpart of nfs_file_direct_read() for callers that supply pages
+ * rather than user addresses.  nfs_sync_mapping() is called on the
+ * mapping before the read is issued.  Only referenced from
+ * nfs_file_direct_read_iter() in this patch, hence static.
+ *
+ * Returns the result of nfs_direct_read_bvec() (advancing iocb->ki_pos
+ * when it is positive), 0 for an empty request, or a negative errno.
+ */
+static ssize_t nfs_file_direct_read_bvec(struct kiocb *iocb,
+					 struct bio_vec *bvec,
+					 unsigned long nr_segs, loff_t pos)
+{
+	ssize_t retval = -EINVAL;
+	struct file *file = iocb->ki_filp;
+	struct address_space *mapping = file->f_mapping;
+	size_t count;
+
+	count = bvec_length(bvec, nr_segs);
+	nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count);
+
+	dfprintk(FILE, "NFS: direct read bvec(%s/%s, %zu@%Ld)\n",
+		file->f_path.dentry->d_parent->d_name.name,
+		file->f_path.dentry->d_name.name,
+		count, (long long) pos);
+
+	retval = 0;
+	if (!count)
+		goto out;
+
+	retval = nfs_sync_mapping(mapping);
+	if (retval)
+		goto out;
+
+	task_io_account_read(count);
+
+	retval = nfs_direct_read_bvec(iocb, bvec, nr_segs, pos);
+	if (retval > 0)
+		iocb->ki_pos = pos + retval;
+
+out:
+	return retval;
+}
+
+/*
+ * nfs_file_direct_read_iter - direct read entry point for iov_iter users
+ *
+ * Dispatches to the iovec or the bio_vec implementation depending on
+ * what @iter carries; any other iterator type hits BUG().
+ */
+ssize_t nfs_file_direct_read_iter(struct kiocb *iocb, struct iov_iter *iter,
+				  loff_t pos)
+{
+	if (iov_iter_has_iovec(iter))
+		return nfs_file_direct_read(iocb, iov_iter_iovec(iter),
+					    iter->nr_segs, pos);
+	else if (iov_iter_has_bvec(iter))
+		return nfs_file_direct_read_bvec(iocb, iov_iter_bvec(iter),
+						 iter->nr_segs, pos);
+	else
+		BUG();
+}
+
 /**
  * nfs_file_direct_write - file direct write operation for NFS files
  * @iocb: target I/O control block
@@ -1012,6 +1275,84 @@ out:
 	return retval;
 }
 
+/*
+ * nfs_file_direct_write_bvec - direct write from an array of bio_vecs
+ * @iocb: target I/O control block
+ * @bvec: array of page/offset/length segments holding the data
+ * @nr_segs: number of entries in @bvec
+ * @pos: file offset to start writing at
+ *
+ * Counterpart of nfs_file_direct_write() for callers that supply pages
+ * rather than user addresses.  The request is validated with
+ * generic_write_checks() and nfs_sync_mapping() is called on the
+ * mapping before the write is issued.  Only referenced from
+ * nfs_file_direct_write_iter() in this patch, hence static.
+ *
+ * Returns the result of nfs_direct_write_bvec() (advancing iocb->ki_pos
+ * when it is positive), 0 for an empty request, or a negative errno.
+ */
+static ssize_t nfs_file_direct_write_bvec(struct kiocb *iocb,
+					  struct bio_vec *bvec,
+					  unsigned long nr_segs, loff_t pos)
+{
+	ssize_t retval = -EINVAL;
+	struct file *file = iocb->ki_filp;
+	struct address_space *mapping = file->f_mapping;
+	size_t count;
+
+	count = bvec_length(bvec, nr_segs);
+	nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count);
+
+	dfprintk(FILE, "NFS: direct write bvec(%s/%s, %zu@%Ld)\n",
+		file->f_path.dentry->d_parent->d_name.name,
+		file->f_path.dentry->d_name.name,
+		count, (long long) pos);
+
+	retval = generic_write_checks(file, &pos, &count, 0);
+	if (retval)
+		goto out;
+
+	retval = -EINVAL;
+	if ((ssize_t) count < 0)
+		goto out;
+	retval = 0;
+	if (!count)
+		goto out;
+
+	retval = nfs_sync_mapping(mapping);
+	if (retval)
+		goto out;
+
+	task_io_account_write(count);
+
+	retval = nfs_direct_write_bvec(iocb, bvec, nr_segs, pos, count);
+
+	if (retval > 0)
+		iocb->ki_pos = pos + retval;
+
+out:
+	return retval;
+}
+
+/*
+ * nfs_file_direct_write_iter - direct write entry point for iov_iter users
+ *
+ * Dispatches to the iovec or the bio_vec implementation depending on
+ * what @iter carries; any other iterator type hits BUG().
+ */
+ssize_t nfs_file_direct_write_iter(struct kiocb *iocb, struct iov_iter *iter,
+				   loff_t pos)
+{
+	if (iov_iter_has_iovec(iter))
+		return nfs_file_direct_write(iocb, iov_iter_iovec(iter),
+					     iter->nr_segs, pos);
+	else if (iov_iter_has_bvec(iter))
+		return nfs_file_direct_write_bvec(iocb, iov_iter_bvec(iter),
+						  iter->nr_segs, pos);
+	else
+		BUG();
+}
+
 /**
  * nfs_init_directcache - create a slab cache for nfs_direct_req structures
  *
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index c43a452..6fdb674 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -646,6 +646,95 @@ static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
 	return ret;
 }
 
+/*
+ * iov_iter flavour of nfs_file_read(): O_DIRECT files go straight to
+ * nfs_file_direct_read_iter(); everything else revalidates the mapping
+ * and reads through generic_file_read_iter(), accounting the bytes
+ * returned as NFSIOS_NORMALREADBYTES.
+ */
+static ssize_t nfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter,
+				  loff_t pos)
+{
+	struct dentry *dentry = iocb->ki_filp->f_path.dentry;
+	struct inode *inode = dentry->d_inode;
+	ssize_t result;
+	size_t count = iov_iter_count(iter);
+
+	if (iocb->ki_filp->f_flags & O_DIRECT)
+		return nfs_file_direct_read_iter(iocb, iter, pos);
+
+	dprintk("NFS: read_iter(%s/%s, %lu@%Ld)\n",
+		dentry->d_parent->d_name.name, dentry->d_name.name,
+		(unsigned long) count, (long long) pos);
+
+	result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
+	if (!result) {
+		result = generic_file_read_iter(iocb, iter, pos);
+		if (result > 0)
+			nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result);
+	}
+	return result;
+}
+
+/*
+ * iov_iter flavour of nfs_file_write(): O_DIRECT files go straight to
+ * nfs_file_direct_write_iter().  Otherwise refuse active swap files,
+ * revalidate the file size for O_APPEND, write through
+ * generic_file_write_iter() and, when nfs_need_sync_write() says so,
+ * fsync so that the error (if any) is returned to the caller.
+ */
+static ssize_t nfs_file_write_iter(struct kiocb *iocb, struct iov_iter *iter,
+				   loff_t pos)
+{
+	struct dentry *dentry = iocb->ki_filp->f_path.dentry;
+	struct inode *inode = dentry->d_inode;
+	unsigned long written = 0;
+	ssize_t result;
+	size_t count = iov_iter_count(iter);
+
+	if (iocb->ki_filp->f_flags & O_DIRECT)
+		return nfs_file_direct_write_iter(iocb, iter, pos);
+
+	dprintk("NFS: write_iter(%s/%s, %lu@%Ld)\n",
+		dentry->d_parent->d_name.name, dentry->d_name.name,
+		(unsigned long) count, (long long) pos);
+
+	result = -EBUSY;
+	if (IS_SWAPFILE(inode))
+		goto out_swapfile;
+	/*
+	 * O_APPEND implies that we must revalidate the file length.
+	 */
+	if (iocb->ki_filp->f_flags & O_APPEND) {
+		result = nfs_revalidate_file_size(inode, iocb->ki_filp);
+		if (result)
+			goto out;
+	}
+
+	result = count;
+	if (!count)
+		goto out;
+
+	result = generic_file_write_iter(iocb, iter, pos);
+	if (result > 0)
+		written = result;
+
+	/* Return error values for O_DSYNC and IS_SYNC() */
+	if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) {
+		int err = vfs_fsync(iocb->ki_filp, 0);
+		if (err < 0)
+			result = err;
+	}
+	if (result > 0)
+		nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written);
+out:
+	return result;
+
+out_swapfile:
+	printk(KERN_INFO "NFS: attempt to write to active swap file!\n");
+	goto out;
+}
+
 static int
 do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
 {
@@ -853,6 +929,8 @@ const struct file_operations nfs_file_operations = {
 	.write		= do_sync_write,
 	.aio_read	= nfs_file_read,
 	.aio_write	= nfs_file_write,
+	.read_iter	= nfs_file_read_iter,
+	.write_iter	= nfs_file_write_iter,
 	.mmap		= nfs_file_mmap,
 	.open		= nfs_file_open,
 	.flush		= nfs_file_flush,
@@ -884,6 +962,8 @@ const struct file_operations nfs4_file_operations = {
 	.write		= do_sync_write,
 	.aio_read	= nfs_file_read,
 	.aio_write	= nfs_file_write,
+	.read_iter	= nfs_file_read_iter,
+	.write_iter	= nfs_file_write_iter,
 	.mmap		= nfs_file_mmap,
 	.open		= nfs4_file_open,
 	.flush		= nfs_file_flush,
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 8c29950..6bda672 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -459,6 +459,10 @@ extern ssize_t nfs_file_direct_read(struct kiocb *iocb,
 extern ssize_t nfs_file_direct_write(struct kiocb *iocb,
 			const struct iovec *iov, unsigned long nr_segs,
 			loff_t pos);
+extern ssize_t nfs_file_direct_read_iter(struct kiocb *iocb,
+					 struct iov_iter *iter, loff_t pos);
+extern ssize_t nfs_file_direct_write_iter(struct kiocb *iocb,
+					  struct iov_iter *iter, loff_t pos);
 
 /*
  * linux/fs/nfs/dir.c
-- 
1.7.9.2


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [RFC PATCH 22/22] nfs: add support for read_iter, write_iter
  2012-02-27 21:19 ` [RFC PATCH 22/22] nfs: add support for read_iter, write_iter Dave Kleikamp
@ 2012-02-27 22:08   ` Myklebust, Trond
  2012-02-27 23:17     ` Dave Kleikamp
  0 siblings, 1 reply; 3+ messages in thread
From: Myklebust, Trond @ 2012-02-27 22:08 UTC (permalink / raw)
  To: Dave Kleikamp
  Cc: linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org,
	Zach Brown, linux-nfs@vger.kernel.org

On Mon, 2012-02-27 at 15:19 -0600, Dave Kleikamp wrote:
> This patch implements the read_iter and write_iter file operations which
> allow kernel code to initiate directIO. This allows the loop device to
> read and write directly to the server, bypassing the page cache.
> 
> Signed-off-by: Dave Kleikamp <dave.kleikamp@oracle.com>
> Cc: Zach Brown <zab@zabbo.net>
> Cc: Trond Myklebust <Trond.Myklebust@netapp.com>
> Cc: linux-nfs@vger.kernel.org

Performance is going to be absolutely terrible for O_DIRECT bvecs if you
send just one page per RPC call. We are working on merging the O_DIRECT
and page cache code in order to give O_DIRECT the ability to coalesce
requests and do pNFS, and I'm hoping that code will be available soon.

In the meantime, wouldn't it be possible to add basic coalescing to
nfs_direct_read_schedule_bvec/nfs_direct_write_schedule_bvec more or
less in the same way that we do for multi-page iovec segments?
i.e. if the next bvec is contiguous with the previous, and the resulting
RPC read length < rsize / write length < wsize, then add it to the same
RPC call.

-- 
Trond Myklebust
Linux NFS client maintainer

NetApp
Trond.Myklebust@netapp.com
www.netapp.com

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [RFC PATCH 22/22] nfs: add support for read_iter, write_iter
  2012-02-27 22:08   ` Myklebust, Trond
@ 2012-02-27 23:17     ` Dave Kleikamp
  0 siblings, 0 replies; 3+ messages in thread
From: Dave Kleikamp @ 2012-02-27 23:17 UTC (permalink / raw)
  To: Myklebust, Trond
  Cc: linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org,
	Zach Brown, linux-nfs@vger.kernel.org

On 02/27/2012 04:08 PM, Myklebust, Trond wrote:
> On Mon, 2012-02-27 at 15:19 -0600, Dave Kleikamp wrote:
>> This patch implements the read_iter and write_iter file operations which
>> allow kernel code to initiate directIO. This allows the loop device to
>> read and write directly to the server, bypassing the page cache.
>>
>> Signed-off-by: Dave Kleikamp <dave.kleikamp@oracle.com>
>> Cc: Zach Brown <zab@zabbo.net>
>> Cc: Trond Myklebust <Trond.Myklebust@netapp.com>
>> Cc: linux-nfs@vger.kernel.org
> 
> Performance is going to be absolutely terrible for O_DIRECT bvecs if you
> send just one page per RPC call. We are working on merging the O_DIRECT
> and page cache code in order to give O_DIRECT the ability to coalesce
> requests and do pNFS, and I'm hoping that code will be available soon.
> 
> In the meantime, wouldn't it be possible to add basic coalescing to
> nfs_direct_read_schedule_bvec/nfs_direct_write_schedule_bvec more or
> less in the same way that we do for multi-page iovec segments?
> i.e. if the next bvec is contiguous with the previous, and the resulting
> RPC read length < rsize / write length < wsize, then add it to the same
> RPC call.

I basically followed the example of what the block layer was doing, but
coalescing makes more sense for nfs. I'll rework it to do that.

Thanks,
Shaggy

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2012-02-27 23:18 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
     [not found] <1330377576-3659-1-git-send-email-dave.kleikamp@oracle.com>
2012-02-27 21:19 ` [RFC PATCH 22/22] nfs: add support for read_iter, write_iter Dave Kleikamp
2012-02-27 22:08   ` Myklebust, Trond
2012-02-27 23:17     ` Dave Kleikamp

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).