linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Dave Kleikamp <dave.kleikamp@oracle.com>
To: linux-fsdevel@vger.kernel.org
Cc: linux-kernel@vger.kernel.org, Zach Brown <zab@zabbo.net>,
	Dave Kleikamp <dave.kleikamp@oracle.com>
Subject: [RFC PATCH v2 10/21] dio: add bio_vec support to __blockdev_direct_IO()
Date: Fri, 30 Mar 2012 10:43:37 -0500	[thread overview]
Message-ID: <1333122228-13633-11-git-send-email-dave.kleikamp@oracle.com> (raw)
In-Reply-To: <1333122228-13633-1-git-send-email-dave.kleikamp@oracle.com>

The trick here is to initialize the dio state so that do_direct_IO()
consumes the pages we provide and never tries to map user pages.  This
is done by making sure that final_block_in_request covers the page that
we set in the dio.  do_direct_IO() will return before running out of
pages.

The caller is responsible for dirtying these pages, if needed.  We add
an option to the dio struct that makes sure we only dirty pages when
we're operating on iovecs of user addresses.

Signed-off-by: Dave Kleikamp <dave.kleikamp@oracle.com>
Cc: Zach Brown <zab@zabbo.net>
---
 fs/direct-io.c |  185 ++++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 133 insertions(+), 52 deletions(-)

diff --git a/fs/direct-io.c b/fs/direct-io.c
index b8bdfba..0883076 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -126,6 +126,7 @@ struct dio {
 	spinlock_t bio_lock;		/* protects BIO fields below */
 	int page_errors;		/* errno from get_user_pages() */
 	int is_async;			/* is IO async ? */
+	int should_dirty;		/* should we mark read pages dirty? */
 	int io_error;			/* IO error in completion path */
 	unsigned long refcount;		/* direct_io_worker() and bios */
 	struct bio *bio_list;		/* singly linked via bi_private */
@@ -420,7 +421,7 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
 	dio->refcount++;
 	spin_unlock_irqrestore(&dio->bio_lock, flags);
 
-	if (dio->is_async && dio->rw == READ)
+	if (dio->is_async && dio->rw == READ && dio->should_dirty)
 		bio_set_pages_dirty(bio);
 
 	if (sdio->submit_io)
@@ -491,13 +492,14 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio)
 	if (!uptodate)
 		dio->io_error = -EIO;
 
-	if (dio->is_async && dio->rw == READ) {
+	if (dio->is_async && dio->rw == READ && dio->should_dirty) {
 		bio_check_pages_dirty(bio);	/* transfers ownership */
 	} else {
 		for (page_no = 0; page_no < bio->bi_vcnt; page_no++) {
 			struct page *page = bvec[page_no].bv_page;
 
-			if (dio->rw == READ && !PageCompound(page))
+			if (dio->rw == READ && !PageCompound(page) &&
+			    dio->should_dirty)
 				set_page_dirty_lock(page);
 			page_cache_release(page);
 		}
@@ -1096,6 +1098,101 @@ static int dio_aligned(unsigned long offset, unsigned *blkbits,
 	return 1;
 }
 
+/*
+ * Run do_direct_IO() once per user iovec segment, stopping at the first
+ * nonzero return.  Sets dio->should_dirty for the READ paths.
+ */
+static ssize_t direct_IO_iovec(const struct iovec *iov, unsigned long nr_segs,
+			       struct dio *dio, struct dio_submit *sdio,
+			       unsigned blkbits, struct buffer_head *map_bh)
+{
+	size_t bytes;
+	ssize_t retval = 0;
+	int seg;
+	unsigned long user_addr;
+
+	/* Add up the number of pages spanned by each segment */
+	for (seg = 0; seg < nr_segs; seg++) {
+		user_addr = (unsigned long)iov[seg].iov_base;
+		sdio->pages_in_io +=
+			((user_addr + iov[seg].iov_len + PAGE_SIZE-1) /
+				PAGE_SIZE - user_addr / PAGE_SIZE);
+	}
+	/* Pages come from user addresses, so reads should dirty them */
+	dio->should_dirty = 1;
+
+	for (seg = 0; seg < nr_segs; seg++) {
+		user_addr = (unsigned long)iov[seg].iov_base;
+		sdio->size += bytes = iov[seg].iov_len;
+		/* Index into the first page of the first block */
+		sdio->first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits;
+		sdio->final_block_in_request = sdio->block_in_file +
+						(bytes >> blkbits);
+		/* Page fetching state */
+		sdio->head = 0;
+		sdio->tail = 0;
+		sdio->curr_page = 0;
+		sdio->total_pages = 0;
+		if (user_addr & (PAGE_SIZE-1)) {
+			sdio->total_pages++;
+			bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1));
+		}
+		sdio->total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
+		sdio->curr_user_address = user_addr;
+
+		retval = do_direct_IO(dio, sdio, map_bh);
+		dio->result += iov[seg].iov_len -
+			((sdio->final_block_in_request - sdio->block_in_file) <<
+					blkbits);
+		if (retval) {
+			dio_cleanup(dio, sdio);
+			break;
+		}
+	} /* end iovec loop */
+	return retval;
+}
+
+/*
+ * Run do_direct_IO() on each bio_vec segment, pre-queueing the segment's
+ * page in dio->pages[] so that no user pages need to be mapped.
+ */
+static ssize_t direct_IO_bvec(struct bio_vec *bvec, unsigned long nr_segs,
+			      struct dio *dio, struct dio_submit *sdio,
+			      unsigned blkbits, struct buffer_head *map_bh)
+{
+	ssize_t retval = 0;
+	int seg;
+
+	/* Add to the count: the caller may already have reserved pages */
+	sdio->pages_in_io += nr_segs;
+	for (seg = 0; seg < nr_segs; seg++) {
+		sdio->size += bvec[seg].bv_len;
+
+		/* Index into the first page of the first block */
+		sdio->first_block_in_page = bvec[seg].bv_offset >> blkbits;
+		sdio->final_block_in_request = sdio->block_in_file +
+						(bvec[seg].bv_len  >> blkbits);
+		/* Page fetching state: one page, pinned via page_cache_get() */
+		sdio->curr_page = 0;
+		page_cache_get(bvec[seg].bv_page);
+		dio->pages[0] = bvec[seg].bv_page;
+		sdio->head = 0;
+		sdio->tail = 1;
+		sdio->total_pages = 1;
+		sdio->curr_user_address = 0;
+
+		retval = do_direct_IO(dio, sdio, map_bh);
+		dio->result += bvec[seg].bv_len -
+			((sdio->final_block_in_request - sdio->block_in_file) <<
+					blkbits);
+		if (retval) {
+			dio_cleanup(dio, sdio);
+			break;
+		}
+	}
+	return retval;
+}
+
 /*
  * This is a library function for use by filesystem drivers.
  *
@@ -1135,10 +1232,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	loff_t end = offset;
 	struct dio *dio;
 	struct dio_submit sdio = { 0, };
-	unsigned long user_addr;
-	size_t bytes;
 	struct buffer_head map_bh = { 0, };
-	const struct iovec *iov = iov_iter_iovec(iter);
 	unsigned long nr_segs = iter->nr_segs;
 
 	if (rw & WRITE)
@@ -1148,13 +1242,33 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 		goto out;
 
 	/* Check the memory alignment.  Blocks cannot straddle pages */
-	for (seg = 0; seg < nr_segs; seg++) {
-		addr = (unsigned long)iov[seg].iov_base;
-		size = iov[seg].iov_len;
-		end += size;
-		if (!dio_aligned(addr|size, &blkbits, bdev))
-			goto out;
-	}
+	if (iov_iter_has_iovec(iter)) {
+		const struct iovec *iov = iov_iter_iovec(iter);
+
+		for (seg = 0; seg < nr_segs; seg++) {
+			addr = (unsigned long)iov[seg].iov_base;
+			size = iov[seg].iov_len;
+			end += size;
+			if (!dio_aligned(addr|size, &blkbits, bdev))
+				goto out;
+		}
+	} else if (iov_iter_has_bvec(iter)) {
+		/*
+		 * Is this necessary, or can we trust the in-kernel
+		 * caller? Can we replace this with
+		 *	end += iov_iter_count(iter); ?
+		 */
+		struct bio_vec *bvec = iov_iter_bvec(iter);
+
+		for (seg = 0; seg < nr_segs; seg++) {
+			addr = bvec[seg].bv_offset;
+			size = bvec[seg].bv_len;
+			end += size;
+			if (!dio_aligned(addr|size, &blkbits, bdev))
+				goto out;
+		}
+	} else
+		BUG();
 
 	/* watch out for a 0 len io from a tricksy fs */
 	if (rw == READ && end == offset)
@@ -1231,45 +1345,12 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	if (unlikely(sdio.blkfactor))
 		sdio.pages_in_io = 2;
 
-	for (seg = 0; seg < nr_segs; seg++) {
-		user_addr = (unsigned long)iov[seg].iov_base;
-		sdio.pages_in_io +=
-			((user_addr + iov[seg].iov_len + PAGE_SIZE-1) /
-				PAGE_SIZE - user_addr / PAGE_SIZE);
-	}
-
-	for (seg = 0; seg < nr_segs; seg++) {
-		user_addr = (unsigned long)iov[seg].iov_base;
-		sdio.size += bytes = iov[seg].iov_len;
-
-		/* Index into the first page of the first block */
-		sdio.first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits;
-		sdio.final_block_in_request = sdio.block_in_file +
-						(bytes >> blkbits);
-		/* Page fetching state */
-		sdio.head = 0;
-		sdio.tail = 0;
-		sdio.curr_page = 0;
-
-		sdio.total_pages = 0;
-		if (user_addr & (PAGE_SIZE-1)) {
-			sdio.total_pages++;
-			bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1));
-		}
-		sdio.total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
-		sdio.curr_user_address = user_addr;
-
-		retval = do_direct_IO(dio, &sdio, &map_bh);
-
-		dio->result += iov[seg].iov_len -
-			((sdio.final_block_in_request - sdio.block_in_file) <<
-					blkbits);
-
-		if (retval) {
-			dio_cleanup(dio, &sdio);
-			break;
-		}
-	} /* end iovec loop */
+	if (iov_iter_has_iovec(iter))
+		retval = direct_IO_iovec(iov_iter_iovec(iter), nr_segs, dio,
+					 &sdio, blkbits, &map_bh);
+	else
+		retval = direct_IO_bvec(iov_iter_bvec(iter), nr_segs, dio,
+					&sdio, blkbits, &map_bh);
 
 	if (retval == -ENOTBLK) {
 		/*
-- 
1.7.9.5


  parent reply	other threads:[~2012-03-30 15:44 UTC|newest]

Thread overview: 36+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2012-03-30 15:43 [RFC PATCH v2 00/21] loop: Issue O_DIRECT aio using bio_vec Dave Kleikamp
2012-03-30 15:43 ` [RFC PATCH v2 01/21] iov_iter: move into its own file Dave Kleikamp
2012-03-30 15:43 ` [RFC PATCH v2 02/21] iov_iter: add copy_to_user support Dave Kleikamp
2012-03-30 15:43 ` [RFC PATCH v2 04/21] iov_iter: hide iovec details behind ops function pointers Dave Kleikamp
2012-03-30 15:43 ` [RFC PATCH v2 05/21] iov_iter: add bvec support Dave Kleikamp
2012-03-30 15:43 ` [RFC PATCH v2 06/21] iov_iter: add a shorten call Dave Kleikamp
2012-03-30 15:43 ` [RFC PATCH v2 07/21] iov_iter: let callers extract iovecs and bio_vecs Dave Kleikamp
2012-03-30 15:43 ` [RFC PATCH v2 08/21] dio: create a dio_aligned() helper function Dave Kleikamp
2012-03-30 15:43 ` [RFC PATCH v2 09/21] dio: Convert direct_IO to use iov_iter Dave Kleikamp
2012-03-30 15:43 ` Dave Kleikamp [this message]
2012-03-30 15:43 ` [RFC PATCH v2 11/21] fs: pull iov_iter use higher up the stack Dave Kleikamp
2012-03-30 15:43 ` [RFC PATCH v2 12/21] aio: add aio_kernel_() interface Dave Kleikamp
2012-03-30 15:43 ` [RFC PATCH v2 13/21] aio: add aio support for iov_iter arguments Dave Kleikamp
2012-03-30 15:43 ` [RFC PATCH v2 14/21] bio: add bvec_length(), like iov_length() Dave Kleikamp
2012-03-30 15:43 ` [RFC PATCH v2 15/21] loop: use aio to perform io on the underlying file Dave Kleikamp
2012-04-20 14:48   ` Maxim V. Patlasov
2012-04-20 15:09     ` Dave Kleikamp
2012-04-20 15:20       ` Jeff Moyer
2012-04-20 15:52         ` Zach Brown
2012-04-20 15:57           ` Dave Kleikamp
2012-04-20 16:14             ` Maxim V. Patlasov
2012-04-20 17:19               ` Dave Kleikamp
2012-04-20 17:37                 ` Maxim V. Patlasov
2012-04-20 16:35           ` Jeff Moyer
2012-04-20 17:48             ` Zach Brown
2012-04-20 16:14         ` Dave Kleikamp
2012-03-30 15:43 ` [RFC PATCH v2 16/21] ext3: add support for .read_iter and .write_iter Dave Kleikamp
2012-03-30 15:43 ` [RFC PATCH v2 17/21] ocfs2: add support for read_iter, write_iter, and direct_IO_bvec Dave Kleikamp
2012-03-30 15:43 ` [RFC PATCH v2 18/21] ext4: add support for read_iter and write_iter Dave Kleikamp
2012-04-02 18:42   ` Ted Ts'o
2012-04-02 22:45     ` Dave Kleikamp
2012-04-03  0:11       ` Dave Kleikamp
     [not found] ` <1333122228-13633-1-git-send-email-dave.kleikamp-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
2012-03-30 15:43   ` [RFC PATCH v2 03/21] fuse: convert fuse to use iov_iter_copy_[to|from]_user Dave Kleikamp
2012-03-30 15:43   ` [RFC PATCH v2 19/21] nfs: add support for read_iter, write_iter Dave Kleikamp
2012-03-30 15:43 ` [RFC PATCH v2 20/21] btrfs: add support for read_iter and write_iter Dave Kleikamp
2012-03-30 15:43 ` [RFC PATCH v2 21/21] fs: add read_iter and write_iter to more file systems Dave Kleikamp

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1333122228-13633-11-git-send-email-dave.kleikamp@oracle.com \
    --to=dave.kleikamp@oracle.com \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=zab@zabbo.net \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

  Be sure your reply has a Subject: header at the top and a blank line
  before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for NNTP newsgroup(s).