linux-fsdevel.vger.kernel.org archive mirror
From: Zach Brown <zach.brown@oracle.com>
To: linux-fsdevel@vger.kernel.org
Subject: [PATCH 5/8] dio: refactor __blockdev_direct_IO()
Date: Thu, 22 Oct 2009 13:25:54 -0700
Message-ID: <1256243157-16667-6-git-send-email-zach.brown@oracle.com>
In-Reply-To: <1256243157-16667-5-git-send-email-zach.brown@oracle.com>

This patch moves code around so that __blockdev_direct_IO() becomes a function
which only iterates over its iovec argument while calling helper functions.
This will let us add functions in the future which call the same helper
functions while iterating over other ways of specifying memory for the IO.
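
Condensed (alignment checks and error handling elided), the function in
the diff below is now essentially:

	dio = dio_prepare(rw, iocb, inode, blkbits, offset, end,
			  get_block, end_io, dio_lock_type);
	ret = dio_lock_and_flush(rw, inode, dio_lock_type, offset, end);
	for (seg = 0; seg < nr_segs; seg++) {
		/* load iov[seg] into dio's page-fetching state, then */
		ret = do_direct_IO(dio);
		if (ret) {
			dio_cleanup(dio);
			break;
		}
	}
	ret = dio_post_submission(dio, offset, end, ret);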

direct_io_worker() was only called once from __blockdev_direct_IO() but took
the iovec argument.  Instead of trying to pass down other ways of specifying
memory, we move its body up into __blockdev_direct_IO() before pulling the
initialization and teardown steps of this resulting huge function off into
helpers.  What's left is a small function that mostly concerns itself with
calling helpers on the iovec.
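
Patch 6 of this series adds an entry point which takes pages instead of
an iovec.  Purely as an illustrative sketch -- the function name,
arguments, and the page bookkeeping step below are assumptions, not
that patch's actual code -- a pages-based caller would reuse the same
helpers:

	/* Illustrative sketch only, not the entry point patch 6 adds. */
	static ssize_t dio_from_pages_sketch(int rw, struct kiocb *iocb,
			struct inode *inode, struct page **pages,
			int nr_pages, loff_t offset, loff_t end,
			unsigned blkbits, get_block_t get_block,
			dio_iodone_t end_io, int dio_lock_type)
	{
		struct dio *dio;
		ssize_t ret;

		dio = dio_prepare(rw, iocb, inode, blkbits, offset, end,
				  get_block, end_io, dio_lock_type);
		if (!dio)
			return -ENOMEM;

		ret = dio_lock_and_flush(rw, inode, dio_lock_type, offset, end);
		if (ret) {
			kfree(dio);
			return ret;
		}

		/*
		 * A real implementation would load dio's page state from
		 * 'pages'/'nr_pages' here instead of walking an iovec.
		 */
		ret = do_direct_IO(dio);
		if (ret)
			dio_cleanup(dio);

		return dio_post_submission(dio, offset, end, ret);
	}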

The dio_unaligned() helper is a little strange, but it reflects the flow of the
original code.  If we were to clean it up we'd run the risk of introducing
bugs.
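
Concretely, the alignment checks in __blockdev_direct_IO() below reduce
to:

	if (dio_unaligned(offset, &blkbits, bdev)) {
		ret = -EINVAL;
		goto out;
	}

with the same test repeated per segment on (user_addr|bytes).  A miss
against the inode blocksize demotes *blkbits to the block device's
logical block size before re-testing, so later checks (and the eventual
dio_prepare() call) use the smaller mask.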

Comments have not been updated to reflect the code change yet.

Signed-off-by: Zach Brown <zach.brown@oracle.com>
---
 fs/direct-io.c |  313 +++++++++++++++++++++++++++++--------------------------
 1 files changed, 165 insertions(+), 148 deletions(-)

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 8b10b87..0a2ba8e 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -925,21 +925,19 @@ out:
 	return ret;
 }
 
-/*
- * Releases both i_mutex and i_alloc_sem
- */
-static ssize_t
-direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, 
-	const struct iovec *iov, loff_t offset, unsigned long nr_segs, 
-	unsigned blkbits, get_block_t get_block, dio_iodone_t end_io,
-	struct dio *dio)
+static struct dio *dio_prepare(int rw, struct kiocb *iocb, struct inode *inode,
+			       unsigned blkbits, loff_t offset, loff_t end,
+			       get_block_t get_block, dio_iodone_t end_io,
+			       int dio_lock_type)
 {
-	unsigned long user_addr; 
-	unsigned long flags;
-	int seg;
-	ssize_t ret = 0;
-	ssize_t ret2;
-	size_t bytes;
+	struct dio *dio;
+
+	dio = kzalloc(sizeof(*dio), GFP_KERNEL);
+	if (!dio)
+		return NULL;
+
+	if (rw & WRITE)
+		rw = WRITE_ODIRECT;
 
 	dio->inode = inode;
 	dio->rw = rw;
@@ -947,6 +945,13 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 	dio->blkfactor = inode->i_blkbits - blkbits;
 	dio->block_in_file = offset >> blkbits;
 
+	/*
+	 * In case of non-aligned buffers, we may need 2 more
+	 * pages since we need to zero out first and last block.
+	 */
+	if (unlikely(dio->blkfactor))
+		dio->pages_in_io = 2;
+
 	dio->get_block = get_block;
 	dio->end_io = end_io;
 	dio->final_block_in_bio = -1;
@@ -958,54 +963,88 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 	spin_lock_init(&dio->bio_lock);
 	dio->refcount = 1;
 
+	dio->lock_type = dio_lock_type;
+
 	/*
-	 * In case of non-aligned buffers, we may need 2 more
-	 * pages since we need to zero out first and last block.
+	 * For file extending writes updating i_size before data
+	 * writeouts complete can expose uninitialized blocks. So
+	 * even for AIO, we need to wait for i/o to complete before
+	 * returning in this case.
 	 */
-	if (unlikely(dio->blkfactor))
-		dio->pages_in_io = 2;
+	dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) &&
+		(end > i_size_read(inode)));
 
-	for (seg = 0; seg < nr_segs; seg++) {
-		user_addr = (unsigned long)iov[seg].iov_base;
-		dio->pages_in_io +=
-			((user_addr+iov[seg].iov_len +PAGE_SIZE-1)/PAGE_SIZE
-				- user_addr/PAGE_SIZE);
+	return dio;
+}
+
+/*
+ * dio works in units of the inode blocksize until it sees IO that
+ * isn't aligned to the inode blocksize.  From that point on, all IO
+ * works in units of the block device's block size.
+ */
+int dio_unaligned(unsigned long val, unsigned *blkbits,
+		  struct block_device *bdev)
+{
+	unsigned mask = (1 << *blkbits) - 1;
+
+	if (val & mask) {
+		if (bdev)
+			*blkbits = blksize_bits(bdev_logical_block_size(bdev));
+		mask = (1 << *blkbits) - 1;
+		return !!(val & mask);
 	}
 
-	for (seg = 0; seg < nr_segs; seg++) {
-		user_addr = (unsigned long)iov[seg].iov_base;
-		dio->size += bytes = iov[seg].iov_len;
+	return 0;
+}
 
-		/* Index into the first page of the first block */
-		dio->first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits;
-		dio->final_block_in_request = dio->block_in_file +
-						(bytes >> blkbits);
-		/* Page fetching state */
-		dio->head = 0;
-		dio->tail = 0;
-		dio->curr_page = 0;
+/*
+ * For block device access DIO_NO_LOCKING is used,
+ *	neither readers nor writers do any locking at all
+ * For regular files using DIO_LOCKING,
+ *	readers need to grab i_mutex and i_alloc_sem
+ *	writers need to grab i_alloc_sem only (i_mutex is already held)
+ * For regular files using DIO_OWN_LOCKING,
+ *	neither readers nor writers take any locks here
+ */
+static int dio_lock_and_flush(int rw, struct inode *inode, int dio_lock_type,
+		              loff_t offset, loff_t end)
+{
+	int ret;
 
-		dio->total_pages = 0;
-		if (user_addr & (PAGE_SIZE-1)) {
-			dio->total_pages++;
-			bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1));
-		}
-		dio->total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
-		dio->curr_user_address = user_addr;
-	
-		ret = do_direct_IO(dio);
+	if (dio_lock_type == DIO_NO_LOCKING)
+		return 0;
 
-		dio->result += iov[seg].iov_len -
-			((dio->final_block_in_request - dio->block_in_file) <<
-					blkbits);
+	/* watch out for a 0 len io from a tricksy fs */
+	if (rw == READ && end > offset) {
+		if (dio_lock_type != DIO_OWN_LOCKING)
+			mutex_lock(&inode->i_mutex);
 
+		ret = filemap_write_and_wait_range(inode->i_mapping, offset,
+						   end - 1);
 		if (ret) {
-			dio_cleanup(dio);
-			break;
+			if (dio_lock_type != DIO_OWN_LOCKING)
+				mutex_unlock(&inode->i_mutex);
+			return ret;
 		}
-	} /* end iovec loop */
 
-	if (ret == -ENOTBLK && (rw & WRITE)) {
+		if (dio_lock_type == DIO_OWN_LOCKING)
+			mutex_unlock(&inode->i_mutex);
+	}
+
+	if (dio_lock_type == DIO_LOCKING)
+		/* lockdep: not the owner will release it */
+		down_read_non_owner(&inode->i_alloc_sem);
+
+	return 0;
+}
+
+static ssize_t dio_post_submission(struct dio *dio, loff_t offset, loff_t end,
+				   int ret)
+{
+	unsigned long flags;
+	int ret2;
+
+	if (ret == -ENOTBLK && (dio->rw & WRITE)) {
 		/*
 		 * The remaining part of the request will be
 		 * be handled by buffered I/O when we return
@@ -1029,7 +1068,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 		dio_bio_submit(dio);
 
 	/* All IO is now issued, send it on its way */
-	blk_run_address_space(inode->i_mapping);
+	blk_run_address_space(dio->inode->i_mapping);
 
 	/*
 	 * It is possible that, we return short IO due to end of file.
@@ -1042,7 +1081,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 	 * we can let i_mutex go now that its achieved its purpose
 	 * of protecting us from looking up uninitialized blocks.
 	 */
-	if ((rw == READ) && (dio->lock_type == DIO_LOCKING))
+	if ((dio->rw == READ) && (dio->lock_type == DIO_LOCKING))
 		mutex_unlock(&dio->inode->i_mutex);
 
 	/*
@@ -1054,7 +1093,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 	 */
 	BUG_ON(ret == -EIOCBQUEUED);
 	if (dio->is_async && ret == 0 && dio->result &&
-	    ((rw & READ) || (dio->result == dio->size)))
+	    ((dio->rw & READ) || (dio->result == dio->size)))
 		ret = -EIOCBQUEUED;
 
 	if (ret != -EIOCBQUEUED)
@@ -1081,6 +1120,22 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 	} else
 		BUG_ON(ret != -EIOCBQUEUED);
 
+	/*
+	 * In case of error extending write may have instantiated a few
+	 * blocks outside i_size. Trim these off again for DIO_LOCKING.
+	 * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this on
+	 * their own.
+	 */
+	if (unlikely(ret < 0 && (dio->rw & WRITE))) {
+		loff_t isize = i_size_read(dio->inode);
+
+		if (end > isize && dio->lock_type == DIO_LOCKING)
+			vmtruncate(dio->inode, isize);
+	}
+
+	if (dio->lock_type == DIO_OWN_LOCKING && dio->rw == READ)
+		mutex_lock(&dio->inode->i_mutex);
+
 	return ret;
 }
 
@@ -1112,122 +1167,84 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	int dio_lock_type)
 {
 	int seg;
-	size_t size;
-	unsigned long addr;
+	size_t bytes;
+	unsigned long user_addr;
 	unsigned blkbits = inode->i_blkbits;
-	unsigned bdev_blkbits = 0;
-	unsigned blocksize_mask = (1 << blkbits) - 1;
-	ssize_t retval = -EINVAL;
+	ssize_t ret;
 	loff_t end = offset;
 	struct dio *dio;
-	int release_i_mutex = 0;
-	int acquire_i_mutex = 0;
-
-	if (rw & WRITE)
-		rw = WRITE_ODIRECT;
-
-	if (bdev)
-		bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev));
 
-	if (offset & blocksize_mask) {
-		if (bdev)
-			 blkbits = bdev_blkbits;
-		blocksize_mask = (1 << blkbits) - 1;
-		if (offset & blocksize_mask)
-			goto out;
+	if (dio_unaligned(offset, &blkbits, bdev)) {
+		ret = -EINVAL;
+		goto out;
 	}
 
 	/* Check the memory alignment.  Blocks cannot straddle pages */
 	for (seg = 0; seg < nr_segs; seg++) {
-		addr = (unsigned long)iov[seg].iov_base;
-		size = iov[seg].iov_len;
-		end += size;
-		if ((addr & blocksize_mask) || (size & blocksize_mask))  {
-			if (bdev)
-				 blkbits = bdev_blkbits;
-			blocksize_mask = (1 << blkbits) - 1;
-			if ((addr & blocksize_mask) || (size & blocksize_mask))  
-				goto out;
+		user_addr = (unsigned long)iov[seg].iov_base;
+		bytes = iov[seg].iov_len;
+		end += bytes;
+		if (dio_unaligned(user_addr|bytes, &blkbits, bdev)) {
+			ret = -EINVAL;
+			goto out;
 		}
 	}
 
-	dio = kzalloc(sizeof(*dio), GFP_KERNEL);
-	retval = -ENOMEM;
-	if (!dio)
+	dio = dio_prepare(rw, iocb, inode, blkbits, offset, end, get_block,
+			  end_io, dio_lock_type);
+	if (!dio) {
+		ret = -ENOMEM;
 		goto out;
+	}
 
-	/*
-	 * For block device access DIO_NO_LOCKING is used,
-	 *	neither readers nor writers do any locking at all
-	 * For regular files using DIO_LOCKING,
-	 *	readers need to grab i_mutex and i_alloc_sem
-	 *	writers need to grab i_alloc_sem only (i_mutex is already held)
-	 * For regular files using DIO_OWN_LOCKING,
-	 *	neither readers nor writers take any locks here
-	 */
-	dio->lock_type = dio_lock_type;
-	if (dio_lock_type != DIO_NO_LOCKING) {
-		/* watch out for a 0 len io from a tricksy fs */
-		if (rw == READ && end > offset) {
-			struct address_space *mapping;
-
-			mapping = iocb->ki_filp->f_mapping;
-			if (dio_lock_type != DIO_OWN_LOCKING) {
-				mutex_lock(&inode->i_mutex);
-				release_i_mutex = 1;
-			}
-
-			retval = filemap_write_and_wait_range(mapping, offset,
-							      end - 1);
-			if (retval) {
-				kfree(dio);
-				goto out;
-			}
-
-			if (dio_lock_type == DIO_OWN_LOCKING) {
-				mutex_unlock(&inode->i_mutex);
-				acquire_i_mutex = 1;
-			}
-		}
+	ret = dio_lock_and_flush(rw, inode, dio_lock_type, offset, end);
+	if (ret) {
+		kfree(dio);
+		goto out;
+	}
 
-		if (dio_lock_type == DIO_LOCKING)
-			/* lockdep: not the owner will release it */
-			down_read_non_owner(&inode->i_alloc_sem);
+	for (seg = 0; seg < nr_segs; seg++) {
+		user_addr = (unsigned long)iov[seg].iov_base;
+		dio->pages_in_io +=
+			((user_addr+iov[seg].iov_len +PAGE_SIZE-1)/PAGE_SIZE
+				- user_addr/PAGE_SIZE);
 	}
 
-	/*
-	 * For file extending writes updating i_size before data
-	 * writeouts complete can expose uninitialized blocks. So
-	 * even for AIO, we need to wait for i/o to complete before
-	 * returning in this case.
-	 */
-	dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) &&
-		(end > i_size_read(inode)));
+	for (seg = 0; seg < nr_segs; seg++) {
+		user_addr = (unsigned long)iov[seg].iov_base;
+		dio->size += bytes = iov[seg].iov_len;
 
-	retval = direct_io_worker(rw, iocb, inode, iov, offset,
-				nr_segs, blkbits, get_block, end_io, dio);
+		/* Index into the first page of the first block */
+		dio->first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits;
+		dio->final_block_in_request = dio->block_in_file +
+						(bytes >> blkbits);
+		/* Page fetching state */
+		dio->head = 0;
+		dio->tail = 0;
+		dio->curr_page = 0;
 
-	/*
-	 * In case of error extending write may have instantiated a few
-	 * blocks outside i_size. Trim these off again for DIO_LOCKING.
-	 * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this by
-	 * it's own meaner.
-	 */
-	if (unlikely(retval < 0 && (rw & WRITE))) {
-		loff_t isize = i_size_read(inode);
+		dio->total_pages = 0;
+		if (user_addr & (PAGE_SIZE-1)) {
+			dio->total_pages++;
+			bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1));
+		}
+		dio->total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
+		dio->curr_user_address = user_addr;
+
+		ret = do_direct_IO(dio);
 
-		if (end > isize && dio_lock_type == DIO_LOCKING)
-			vmtruncate(inode, isize);
-	}
+		dio->result += iov[seg].iov_len -
+			((dio->final_block_in_request - dio->block_in_file) <<
+					blkbits);
 
-	if (rw == READ && dio_lock_type == DIO_LOCKING)
-		release_i_mutex = 0;
+		if (ret) {
+			dio_cleanup(dio);
+			break;
+		}
+	}
 
+	ret = dio_post_submission(dio, offset, end, ret);
 out:
-	if (release_i_mutex)
-		mutex_unlock(&inode->i_mutex);
-	else if (acquire_i_mutex)
-		mutex_lock(&inode->i_mutex);
-	return retval;
+	return ret;
 }
 EXPORT_SYMBOL(__blockdev_direct_IO);
-- 
1.6.2.5



Thread overview: 24+ messages
2009-10-22 20:25 [RFC] loop: issue aio with pages Zach Brown
2009-10-22 20:25 ` [PATCH 1/8] gadgetfs: use schedule_work() instead of EIOCBRETRY Zach Brown
2009-10-22 20:25   ` [PATCH 2/8] aio: disable retry Zach Brown
2009-10-22 20:25     ` [PATCH 3/8] aio: add an interface to submit aio from the kernel Zach Brown
2009-10-22 20:25       ` [PATCH 4/8] aio: add aio_read_pages and aio_write_pages Zach Brown
2009-10-22 20:25         ` Zach Brown [this message]
2009-10-22 20:25           ` [PATCH 6/8] dio: add an entry point which takes pages Zach Brown
2009-10-22 20:25             ` [PATCH 7/8] block: provide aio_read_pages and aio_write_pages Zach Brown
2009-10-22 20:25               ` [PATCH 8/8] loop: use aio to perform io on the underlying file Zach Brown
2009-10-27 16:01                 ` Jeff Moyer
2009-10-27 15:49             ` [PATCH 6/8] dio: add an entry point which takes pages Jeff Moyer
2009-10-27 17:50               ` Zach Brown
2009-10-27 15:39           ` [PATCH 5/8] dio: refactor __blockdev_direct_IO() Jeff Moyer
2009-10-26 16:17         ` [PATCH 4/8] aio: add aio_read_pages and aio_write_pages Jeff Moyer
2009-10-26 17:08           ` Jeff Moyer
2009-10-26 22:22             ` Zach Brown
2009-10-26 16:10       ` [PATCH 3/8] aio: add an interface to submit aio from the kernel Jeff Moyer
2009-10-26 22:21         ` Zach Brown
2009-10-25  7:37     ` [PATCH 2/8] aio: disable retry Christoph Hellwig
2009-10-26 22:15       ` Zach Brown
2009-10-26 16:00     ` Jeff Moyer
2009-10-26 15:57   ` [PATCH 1/8] gadgetfs: use schedule_work() instead of EIOCBRETRY Jeff Moyer
2009-10-25  7:36 ` [RFC] loop: issue aio with pages Christoph Hellwig
2009-10-26 22:13   ` Zach Brown
