linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Dave Kleikamp <dave.kleikamp@oracle.com>
To: linux-kernel@vger.kernel.org
Cc: linux-fsdevel@vger.kernel.org,
	Andrew Morton <akpm@linux-foundation.org>,
	"Maxim V. Patlasov" <mpatlasov@parallels.com>,
	Zach Brown <zab@zabbo.net>, Christoph Hellwig <hch@infradead.org>,
	Dave Kleikamp <dave.kleikamp@oracle.com>
Subject: [PATCH V9 12/33] dio: add bio_vec support to __blockdev_direct_IO()
Date: Wed, 16 Oct 2013 09:04:25 -0500	[thread overview]
Message-ID: <1381932286-14978-13-git-send-email-dave.kleikamp@oracle.com> (raw)
In-Reply-To: <1381932286-14978-1-git-send-email-dave.kleikamp@oracle.com>

The trick here is to initialize the dio state so that do_direct_IO()
consumes the pages we provide and never tries to map user pages.  This
is done by queuing one page per bio_vec in the dio and making sure that
final_block_in_request covers just that page, so do_direct_IO() returns
before running out of pages.

The caller is responsible for dirtying these pages, if needed.  We add
a should_dirty flag to the dio struct that makes sure we only dirty
pages when we're operating on iovecs of user addresses.

Signed-off-by: Dave Kleikamp <dave.kleikamp@oracle.com>
Tested-by: Sedat Dilek <sedat.dilek@gmail.com>
Cc: Zach Brown <zab@zabbo.net>
---
 fs/direct-io.c | 206 +++++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 148 insertions(+), 58 deletions(-)

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 1cf8f17..a142314 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -127,6 +127,7 @@ struct dio {
 	spinlock_t bio_lock;		/* protects BIO fields below */
 	int page_errors;		/* errno from get_user_pages() */
 	int is_async;			/* is IO async ? */
+	int should_dirty;		/* should we mark read pages dirty? */
 	bool defer_completion;		/* defer AIO completion to workqueue? */
 	int io_error;			/* IO error in completion path */
 	unsigned long refcount;		/* direct_io_worker() and bios */
@@ -403,7 +404,7 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
 	dio->refcount++;
 	spin_unlock_irqrestore(&dio->bio_lock, flags);
 
-	if (dio->is_async && dio->rw == READ)
+	if (dio->is_async && dio->rw == READ && dio->should_dirty)
 		bio_set_pages_dirty(bio);
 
 	if (sdio->submit_io)
@@ -474,13 +475,14 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio)
 	if (!uptodate)
 		dio->io_error = -EIO;
 
-	if (dio->is_async && dio->rw == READ) {
+	if (dio->is_async && dio->rw == READ && dio->should_dirty) {
 		bio_check_pages_dirty(bio);	/* transfers ownership */
 	} else {
 		bio_for_each_segment_all(bvec, bio, i) {
 			struct page *page = bvec->bv_page;
 
-			if (dio->rw == READ && !PageCompound(page))
+			if (dio->rw == READ && !PageCompound(page) &&
+			    dio->should_dirty)
 				set_page_dirty_lock(page);
 			page_cache_release(page);
 		}
@@ -1081,6 +1083,101 @@ static inline int drop_refcount(struct dio *dio)
 	return ret2;
 }
 
+static ssize_t direct_IO_iovec(const struct iovec *iov, unsigned long nr_segs,
+			       struct dio *dio, struct dio_submit *sdio,
+			       unsigned blkbits, struct buffer_head *map_bh)
+{
+	size_t bytes;
+	ssize_t retval = 0;
+	int seg;
+	unsigned long user_addr;
+
+	for (seg = 0; seg < nr_segs; seg++) {
+		user_addr = (unsigned long)iov[seg].iov_base;
+		sdio->pages_in_io +=
+			((user_addr + iov[seg].iov_len + PAGE_SIZE-1) /
+				PAGE_SIZE - user_addr / PAGE_SIZE);
+	}
+
+	dio->should_dirty = 1;
+
+	for (seg = 0; seg < nr_segs; seg++) {
+		user_addr = (unsigned long)iov[seg].iov_base;
+		sdio->size += bytes = iov[seg].iov_len;
+
+		/* Index into the first page of the first block */
+		sdio->first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits;
+		sdio->final_block_in_request = sdio->block_in_file +
+						(bytes >> blkbits);
+		/* Page fetching state */
+		sdio->head = 0;
+		sdio->tail = 0;
+		sdio->curr_page = 0;
+
+		sdio->total_pages = 0;
+		if (user_addr & (PAGE_SIZE-1)) {
+			sdio->total_pages++;
+			bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1));
+		}
+		sdio->total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
+		sdio->curr_user_address = user_addr;
+
+		retval = do_direct_IO(dio, sdio, map_bh);
+
+		dio->result += iov[seg].iov_len -
+			((sdio->final_block_in_request - sdio->block_in_file) <<
+					blkbits);
+
+		if (retval) {
+			dio_cleanup(dio, sdio);
+			break;
+		}
+	} /* end iovec loop */
+
+	return retval;
+}
+
+static ssize_t direct_IO_bvec(struct bio_vec *bvec, unsigned long nr_segs,
+			      struct dio *dio, struct dio_submit *sdio,
+			      unsigned blkbits, struct buffer_head *map_bh)
+{
+	ssize_t retval = 0;
+	int seg;
+
+	sdio->pages_in_io += nr_segs;
+
+	for (seg = 0; seg < nr_segs; seg++) {
+		sdio->size += bvec[seg].bv_len;
+
+		/* Index into the first page of the first block */
+		sdio->first_block_in_page = bvec[seg].bv_offset >> blkbits;
+		sdio->final_block_in_request = sdio->block_in_file +
+						(bvec[seg].bv_len  >> blkbits);
+		/* Page fetching state */
+		sdio->curr_page = 0;
+		page_cache_get(bvec[seg].bv_page);
+		dio->pages[0] = bvec[seg].bv_page;
+		sdio->head = 0;
+		sdio->tail = 1;
+
+		sdio->total_pages = 1;
+		sdio->curr_user_address = 0;
+
+		retval = do_direct_IO(dio, sdio, map_bh);
+
+		dio->result += bvec[seg].bv_len -
+			((sdio->final_block_in_request - sdio->block_in_file) <<
+					blkbits);
+
+		if (retval) {
+			dio_cleanup(dio, sdio);
+			break;
+		}
+	}
+
+	return retval;
+}
+
 /*
  * This is a library function for use by filesystem drivers.
  *
@@ -1122,11 +1219,8 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	loff_t end = offset;
 	struct dio *dio;
 	struct dio_submit sdio = { 0, };
-	unsigned long user_addr;
-	size_t bytes;
 	struct buffer_head map_bh = { 0, };
 	struct blk_plug plug;
-	const struct iovec *iov = iov_iter_iovec(iter);
 	unsigned long nr_segs = iter->nr_segs;
 
 	if (rw & WRITE)
@@ -1146,20 +1240,49 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	}
 
 	/* Check the memory alignment.  Blocks cannot straddle pages */
-	for (seg = 0; seg < nr_segs; seg++) {
-		addr = (unsigned long)iov[seg].iov_base;
-		size = iov[seg].iov_len;
-		end += size;
-		if (unlikely((addr & blocksize_mask) ||
-			     (size & blocksize_mask))) {
-			if (bdev)
-				blkbits = blksize_bits(
-					 bdev_logical_block_size(bdev));
-			blocksize_mask = (1 << blkbits) - 1;
-			if ((addr & blocksize_mask) || (size & blocksize_mask))
-				goto out;
+	if (iov_iter_has_iovec(iter)) {
+		const struct iovec *iov = iov_iter_iovec(iter);
+
+		for (seg = 0; seg < nr_segs; seg++) {
+			addr = (unsigned long)iov[seg].iov_base;
+			size = iov[seg].iov_len;
+			end += size;
+			if (unlikely((addr & blocksize_mask) ||
+				     (size & blocksize_mask))) {
+				if (bdev)
+					blkbits = blksize_bits(
+						 bdev_logical_block_size(bdev));
+				blocksize_mask = (1 << blkbits) - 1;
+				if ((addr & blocksize_mask) ||
+				    (size & blocksize_mask))
+					goto out;
+			}
 		}
-	}
+	} else if (iov_iter_has_bvec(iter)) {
+		/*
+		 * Is this necessary, or can we trust the in-kernel
+		 * caller? Can we replace this with
+		 *	end += iov_iter_count(iter); ?
+		 */
+		struct bio_vec *bvec = iov_iter_bvec(iter);
+
+		for (seg = 0; seg < nr_segs; seg++) {
+			addr = bvec[seg].bv_offset;
+			size = bvec[seg].bv_len;
+			end += size;
+			if (unlikely((addr & blocksize_mask) ||
+				     (size & blocksize_mask))) {
+				if (bdev)
+					blkbits = blksize_bits(
+						 bdev_logical_block_size(bdev));
+				blocksize_mask = (1 << blkbits) - 1;
+				if ((addr & blocksize_mask) ||
+				    (size & blocksize_mask))
+					goto out;
+			}
+		}
+	} else
+		BUG();
 
 	/* watch out for a 0 len io from a tricksy fs */
 	if (rw == READ && end == offset)
@@ -1253,47 +1376,14 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	if (unlikely(sdio.blkfactor))
 		sdio.pages_in_io = 2;
 
-	for (seg = 0; seg < nr_segs; seg++) {
-		user_addr = (unsigned long)iov[seg].iov_base;
-		sdio.pages_in_io +=
-			((user_addr + iov[seg].iov_len + PAGE_SIZE-1) /
-				PAGE_SIZE - user_addr / PAGE_SIZE);
-	}
-
 	blk_start_plug(&plug);
 
-	for (seg = 0; seg < nr_segs; seg++) {
-		user_addr = (unsigned long)iov[seg].iov_base;
-		sdio.size += bytes = iov[seg].iov_len;
-
-		/* Index into the first page of the first block */
-		sdio.first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits;
-		sdio.final_block_in_request = sdio.block_in_file +
-						(bytes >> blkbits);
-		/* Page fetching state */
-		sdio.head = 0;
-		sdio.tail = 0;
-		sdio.curr_page = 0;
-
-		sdio.total_pages = 0;
-		if (user_addr & (PAGE_SIZE-1)) {
-			sdio.total_pages++;
-			bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1));
-		}
-		sdio.total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
-		sdio.curr_user_address = user_addr;
-
-		retval = do_direct_IO(dio, &sdio, &map_bh);
-
-		dio->result += iov[seg].iov_len -
-			((sdio.final_block_in_request - sdio.block_in_file) <<
-					blkbits);
-
-		if (retval) {
-			dio_cleanup(dio, &sdio);
-			break;
-		}
-	} /* end iovec loop */
+	if (iov_iter_has_iovec(iter))
+		retval = direct_IO_iovec(iov_iter_iovec(iter), nr_segs, dio,
+					 &sdio, blkbits, &map_bh);
+	else
+		retval = direct_IO_bvec(iov_iter_bvec(iter), nr_segs, dio,
+					&sdio, blkbits, &map_bh);
 
 	if (retval == -ENOTBLK) {
 		/*
-- 
1.8.4

  parent reply	other threads:[~2013-10-16 14:04 UTC|newest]

Thread overview: 35+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-10-16 14:04 [PATCH V9 00/33] loop: Issue O_DIRECT aio using bio_vec Dave Kleikamp
2013-10-16 14:04 ` [PATCH V9 01/33] iov_iter: move into its own file Dave Kleikamp
2013-10-16 14:04 ` [PATCH V9 02/33] iov_iter: iov_iter_copy_from_user() should use non-atomic copy Dave Kleikamp
2013-10-16 14:04 ` [PATCH V9 03/33] iov_iter: add copy_to_user support Dave Kleikamp
2013-10-16 14:04 ` [PATCH V9 04/33] iov_iter: add __iovec_copy_to_user() Dave Kleikamp
     [not found] ` <1381932286-14978-1-git-send-email-dave.kleikamp-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
2013-10-16 14:04   ` [PATCH V9 05/33] fuse: convert fuse to use iov_iter_copy_[to|from]_user Dave Kleikamp
2013-10-16 14:04 ` [PATCH V9 06/33] iov_iter: hide iovec details behind ops function pointers Dave Kleikamp
2013-10-16 14:04 ` [PATCH V9 07/33] iov_iter: ii_iovec_copy_to_user should pre-fault user pages Dave Kleikamp
2013-10-16 14:04 ` [PATCH V9 08/33] iov_iter: add bvec support Dave Kleikamp
2013-10-16 14:04 ` [PATCH V9 09/33] iov_iter: add a shorten call Dave Kleikamp
2013-10-16 14:04 ` [PATCH V9 10/33] iov_iter: let callers extract iovecs and bio_vecs Dave Kleikamp
2013-10-16 14:04 ` [PATCH V9 11/33] dio: Convert direct_IO to use iov_iter Dave Kleikamp
2013-10-16 14:04 ` Dave Kleikamp [this message]
2013-10-16 14:04 ` [PATCH V9 13/33] fs: pull iov_iter use higher up the stack Dave Kleikamp
2013-10-16 14:04 ` [PATCH V9 14/33] aio: add aio_kernel_() interface Dave Kleikamp
2013-10-16 14:04 ` [PATCH V9 15/33] aio: add aio support for iov_iter arguments Dave Kleikamp
2013-10-16 14:04 ` [PATCH V9 16/33] bio: add bvec_length(), like iov_length() Dave Kleikamp
2013-10-16 14:04 ` [PATCH V9 17/33] loop: use aio to perform io on the underlying file Dave Kleikamp
2013-10-18 17:55   ` [PATCH V9.1 " Dave Kleikamp
2013-10-16 14:04 ` [PATCH V9 18/33] fs: create file_readable() and file_writable() functions Dave Kleikamp
2013-10-16 14:04 ` [PATCH V9 19/33] fs: use read_iter and write_iter rather than aio_read and aio_write Dave Kleikamp
2013-10-16 14:04 ` [PATCH V9 20/33] fs: add read_iter and write_iter to several file systems Dave Kleikamp
2013-10-16 14:04 ` [PATCH V9 21/33] ocfs2: add support for read_iter and write_iter Dave Kleikamp
2013-10-16 14:04 ` [PATCH V9 22/33] ext4: " Dave Kleikamp
2013-10-16 14:04 ` [PATCH V9 23/33] nfs: add support for read_iter, write_iter Dave Kleikamp
2013-10-16 14:04 ` [PATCH V9 24/33] nfs: simplify swap Dave Kleikamp
2013-10-16 14:04 ` [PATCH V9 25/33] btrfs: add support for read_iter and write_iter Dave Kleikamp
2013-10-16 14:04 ` [PATCH V9 26/33] block_dev: add support for read_iter, write_iter Dave Kleikamp
2013-10-16 14:04 ` [PATCH V9 27/33] xfs: add support for read_iter and write_iter Dave Kleikamp
2013-10-16 14:04 ` [PATCH V9 28/33] gfs2: Convert aio_read/write ops to read/write_iter Dave Kleikamp
2013-10-16 14:04 ` [PATCH V9 29/33] udf: convert file ops from aio_read/write " Dave Kleikamp
2013-10-16 14:04 ` [PATCH V9 30/33] afs: add support for read_iter and write_iter Dave Kleikamp
2013-10-16 14:04 ` [PATCH V9 31/33] ecrpytfs: Convert aio_read/write ops to read/write_iter Dave Kleikamp
2013-10-16 14:04 ` [PATCH V9 32/33] ubifs: convert file ops from aio_read/write " Dave Kleikamp
2013-10-16 14:04 ` [PATCH V9 33/33] tmpfs: add support for read_iter and write_iter Dave Kleikamp

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1381932286-14978-13-git-send-email-dave.kleikamp@oracle.com \
    --to=dave.kleikamp@oracle.com \
    --cc=akpm@linux-foundation.org \
    --cc=hch@infradead.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mpatlasov@parallels.com \
    --cc=zab@zabbo.net \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).