From: Dave Kleikamp <dave.kleikamp@oracle.com>
To: linux-fsdevel@vger.kernel.org
Cc: linux-kernel@vger.kernel.org, Zach Brown <zab@zabbo.net>,
"Maxim V. Patlasov" <mpatlasov@parallels.com>,
Dave Kleikamp <dave.kleikamp@oracle.com>
Subject: [PATCH 11/22] dio: add bio_vec support to __blockdev_direct_IO()
Date: Mon, 22 Oct 2012 10:15:11 -0500 [thread overview]
Message-ID: <1350918922-6096-12-git-send-email-dave.kleikamp@oracle.com> (raw)
In-Reply-To: <1350918922-6096-1-git-send-email-dave.kleikamp@oracle.com>
The trick here is to initialize the dio state so that do_direct_IO()
consumes the pages we provide and never tries to map user pages. This
is done by loading each bio_vec's page into dio->pages[0] and making
sure that final_block_in_request covers only the blocks of that
segment, so do_direct_IO() will return before running out of pages.
The caller is responsible for dirtying these pages, if needed. We add
a should_dirty flag to the dio struct that makes sure we only dirty
pages when we're operating on iovecs of user addresses.
Signed-off-by: Dave Kleikamp <dave.kleikamp@oracle.com>
Cc: Zach Brown <zab@zabbo.net>
---
fs/direct-io.c | 185 +++++++++++++++++++++++++++++++++++++++++----------------
1 file changed, 133 insertions(+), 52 deletions(-)
diff --git a/fs/direct-io.c b/fs/direct-io.c
index e2733e4..8417a3f 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -126,6 +126,7 @@ struct dio {
spinlock_t bio_lock; /* protects BIO fields below */
int page_errors; /* errno from get_user_pages() */
int is_async; /* is IO async ? */
+ int should_dirty; /* should we mark read pages dirty? */
int io_error; /* IO error in completion path */
unsigned long refcount; /* direct_io_worker() and bios */
struct bio *bio_list; /* singly linked via bi_private */
@@ -376,7 +377,7 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
dio->refcount++;
spin_unlock_irqrestore(&dio->bio_lock, flags);
- if (dio->is_async && dio->rw == READ)
+ if (dio->is_async && dio->rw == READ && dio->should_dirty)
bio_set_pages_dirty(bio);
if (sdio->submit_io)
@@ -447,13 +448,14 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio)
if (!uptodate)
dio->io_error = -EIO;
- if (dio->is_async && dio->rw == READ) {
+ if (dio->is_async && dio->rw == READ && dio->should_dirty) {
bio_check_pages_dirty(bio); /* transfers ownership */
} else {
for (page_no = 0; page_no < bio->bi_vcnt; page_no++) {
struct page *page = bvec[page_no].bv_page;
- if (dio->rw == READ && !PageCompound(page))
+ if (dio->rw == READ && !PageCompound(page) &&
+ dio->should_dirty)
set_page_dirty_lock(page);
page_cache_release(page);
}
@@ -1052,6 +1054,101 @@ static int dio_aligned(unsigned long offset, unsigned *blkbits,
return 1;
}
+static ssize_t direct_IO_iovec(const struct iovec *iov, unsigned long nr_segs,
+ struct dio *dio, struct dio_submit *sdio,
+ unsigned blkbits, struct buffer_head *map_bh)
+{
+ size_t bytes;
+ ssize_t retval = 0;
+ int seg;
+ unsigned long user_addr;
+
+ for (seg = 0; seg < nr_segs; seg++) {
+ user_addr = (unsigned long)iov[seg].iov_base;
+ sdio->pages_in_io +=
+ ((user_addr + iov[seg].iov_len + PAGE_SIZE-1) /
+ PAGE_SIZE - user_addr / PAGE_SIZE);
+ }
+
+ dio->should_dirty = 1;
+
+ for (seg = 0; seg < nr_segs; seg++) {
+ user_addr = (unsigned long)iov[seg].iov_base;
+ sdio->size += bytes = iov[seg].iov_len;
+
+ /* Index into the first page of the first block */
+ sdio->first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits;
+ sdio->final_block_in_request = sdio->block_in_file +
+ (bytes >> blkbits);
+ /* Page fetching state */
+ sdio->head = 0;
+ sdio->tail = 0;
+ sdio->curr_page = 0;
+
+ sdio->total_pages = 0;
+ if (user_addr & (PAGE_SIZE-1)) {
+ sdio->total_pages++;
+ bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1));
+ }
+ sdio->total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
+ sdio->curr_user_address = user_addr;
+
+ retval = do_direct_IO(dio, sdio, map_bh);
+
+ dio->result += iov[seg].iov_len -
+ ((sdio->final_block_in_request - sdio->block_in_file) <<
+ blkbits);
+
+ if (retval) {
+ dio_cleanup(dio, sdio);
+ break;
+ }
+ } /* end iovec loop */
+
+ return retval;
+}
+
+static ssize_t direct_IO_bvec(struct bio_vec *bvec, unsigned long nr_segs,
+ struct dio *dio, struct dio_submit *sdio,
+ unsigned blkbits, struct buffer_head *map_bh)
+{
+ ssize_t retval = 0;
+ int seg;
+
+ sdio->pages_in_io += nr_segs;
+
+ for (seg = 0; seg < nr_segs; seg++) {
+ sdio->size += bvec[seg].bv_len;
+
+ /* Index into the first page of the first block */
+ sdio->first_block_in_page = bvec[seg].bv_offset >> blkbits;
+ sdio->final_block_in_request = sdio->block_in_file +
+ (bvec[seg].bv_len >> blkbits);
+ /* Page fetching state */
+ sdio->curr_page = 0;
+ page_cache_get(bvec[seg].bv_page);
+ dio->pages[0] = bvec[seg].bv_page;
+ sdio->head = 0;
+ sdio->tail = 1;
+
+ sdio->total_pages = 1;
+ sdio->curr_user_address = 0;
+
+ retval = do_direct_IO(dio, sdio, map_bh);
+
+ dio->result += bvec[seg].bv_len -
+ ((sdio->final_block_in_request - sdio->block_in_file) <<
+ blkbits);
+
+ if (retval) {
+ dio_cleanup(dio, sdio);
+ break;
+ }
+ }
+
+ return retval;
+}
+
/*
* This is a library function for use by filesystem drivers.
*
@@ -1091,11 +1188,8 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
loff_t end = offset;
struct dio *dio;
struct dio_submit sdio = { 0, };
- unsigned long user_addr;
- size_t bytes;
struct buffer_head map_bh = { 0, };
struct blk_plug plug;
- const struct iovec *iov = iov_iter_iovec(iter);
unsigned long nr_segs = iter->nr_segs;
if (rw & WRITE)
@@ -1105,13 +1199,33 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
goto out;
/* Check the memory alignment. Blocks cannot straddle pages */
- for (seg = 0; seg < nr_segs; seg++) {
- addr = (unsigned long)iov[seg].iov_base;
- size = iov[seg].iov_len;
- end += size;
- if (!dio_aligned(addr|size, &blkbits, bdev))
- goto out;
- }
+ if (iov_iter_has_iovec(iter)) {
+ const struct iovec *iov = iov_iter_iovec(iter);
+
+ for (seg = 0; seg < nr_segs; seg++) {
+ addr = (unsigned long)iov[seg].iov_base;
+ size = iov[seg].iov_len;
+ end += size;
+ if (!dio_aligned(addr|size, &blkbits, bdev))
+ goto out;
+ }
+ } else if (iov_iter_has_bvec(iter)) {
+ /*
+ * Is this necessary, or can we trust the in-kernel
+ * caller? Can we replace this with
+ * end += iov_iter_count(iter); ?
+ */
+ struct bio_vec *bvec = iov_iter_bvec(iter);
+
+ for (seg = 0; seg < nr_segs; seg++) {
+ addr = bvec[seg].bv_offset;
+ size = bvec[seg].bv_len;
+ end += size;
+ if (!dio_aligned(addr|size, &blkbits, bdev))
+ goto out;
+ }
+ } else
+ BUG();
/* watch out for a 0 len io from a tricksy fs */
if (rw == READ && end == offset)
@@ -1188,47 +1302,14 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
if (unlikely(sdio.blkfactor))
sdio.pages_in_io = 2;
- for (seg = 0; seg < nr_segs; seg++) {
- user_addr = (unsigned long)iov[seg].iov_base;
- sdio.pages_in_io +=
- ((user_addr + iov[seg].iov_len + PAGE_SIZE-1) /
- PAGE_SIZE - user_addr / PAGE_SIZE);
- }
-
blk_start_plug(&plug);
- for (seg = 0; seg < nr_segs; seg++) {
- user_addr = (unsigned long)iov[seg].iov_base;
- sdio.size += bytes = iov[seg].iov_len;
-
- /* Index into the first page of the first block */
- sdio.first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits;
- sdio.final_block_in_request = sdio.block_in_file +
- (bytes >> blkbits);
- /* Page fetching state */
- sdio.head = 0;
- sdio.tail = 0;
- sdio.curr_page = 0;
-
- sdio.total_pages = 0;
- if (user_addr & (PAGE_SIZE-1)) {
- sdio.total_pages++;
- bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1));
- }
- sdio.total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
- sdio.curr_user_address = user_addr;
-
- retval = do_direct_IO(dio, &sdio, &map_bh);
-
- dio->result += iov[seg].iov_len -
- ((sdio.final_block_in_request - sdio.block_in_file) <<
- blkbits);
-
- if (retval) {
- dio_cleanup(dio, &sdio);
- break;
- }
- } /* end iovec loop */
+ if (iov_iter_has_iovec(iter))
+ retval = direct_IO_iovec(iov_iter_iovec(iter), nr_segs, dio,
+ &sdio, blkbits, &map_bh);
+ else
+ retval = direct_IO_bvec(iov_iter_bvec(iter), nr_segs, dio,
+ &sdio, blkbits, &map_bh);
if (retval == -ENOTBLK) {
/*
--
1.7.12.3
next prev parent reply other threads:[~2012-10-22 15:15 UTC|newest]
Thread overview: 29+ messages / expand[flat|nested] mbox.gz Atom feed top
2012-10-22 15:15 [PATCH v3 00/22] loop: Issue O_DIRECT aio using bio_vec Dave Kleikamp
2012-10-22 15:15 ` [PATCH 01/22] iov_iter: move into its own file Dave Kleikamp
2012-10-22 15:15 ` [PATCH 02/22] iov_iter: iov_iter_copy_from_user() should use non-atomic copy Dave Kleikamp
2012-10-22 15:15 ` [PATCH 03/22] iov_iter: add copy_to_user support Dave Kleikamp
[not found] ` <1350918922-6096-1-git-send-email-dave.kleikamp-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
2012-10-22 15:15 ` [PATCH 04/22] fuse: convert fuse to use iov_iter_copy_[to|from]_user Dave Kleikamp
2012-10-22 15:15 ` [PATCH 05/22] iov_iter: hide iovec details behind ops function pointers Dave Kleikamp
2012-10-22 15:15 ` [PATCH 06/22] iov_iter: add bvec support Dave Kleikamp
2012-10-22 15:15 ` [PATCH 07/22] iov_iter: add a shorten call Dave Kleikamp
2012-10-22 15:15 ` [PATCH 08/22] iov_iter: let callers extract iovecs and bio_vecs Dave Kleikamp
2012-10-22 15:15 ` [PATCH 09/22] dio: create a dio_aligned() helper function Dave Kleikamp
2012-10-22 15:15 ` [PATCH 10/22] dio: Convert direct_IO to use iov_iter Dave Kleikamp
2012-10-22 15:15 ` Dave Kleikamp [this message]
2012-10-22 15:15 ` [PATCH 12/22] fs: pull iov_iter use higher up the stack Dave Kleikamp
2012-10-22 15:15 ` [PATCH 13/22] aio: add aio_kernel_() interface Dave Kleikamp
2012-10-22 15:15 ` [PATCH 14/22] aio: add aio support for iov_iter arguments Dave Kleikamp
2012-10-22 15:15 ` [PATCH 15/22] bio: add bvec_length(), like iov_length() Dave Kleikamp
2012-10-22 15:15 ` [PATCH 16/22] loop: use aio to perform io on the underlying file Dave Kleikamp
2012-10-22 15:15 ` [PATCH 17/22] fs: add read_iter and write_iter to several file systems Dave Kleikamp
2012-10-22 15:15 ` [PATCH 18/22] ocfs2: add support for read_iter, write_iter, and direct_IO_bvec Dave Kleikamp
2012-10-22 15:15 ` [PATCH 19/22] ext4: add support for read_iter and write_iter Dave Kleikamp
2012-10-22 15:15 ` [PATCH 20/22] nfs: add support for read_iter, write_iter Dave Kleikamp
2012-10-22 15:21 ` Myklebust, Trond
2012-10-22 15:35 ` Dave Kleikamp
2012-10-22 15:15 ` [PATCH 21/22] btrfs: add support for read_iter and write_iter Dave Kleikamp
2012-10-22 15:15 ` [PATCH 22/22] block_dev: add support for read_iter, write_iter Dave Kleikamp
2012-10-23 0:07 ` [PATCH v3 00/22] loop: Issue O_DIRECT aio using bio_vec Dave Chinner
2012-10-23 0:53 ` Dave Kleikamp
2012-10-23 1:59 ` Dave Chinner
2012-10-23 13:04 ` Christoph Hellwig
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1350918922-6096-12-git-send-email-dave.kleikamp@oracle.com \
--to=dave.kleikamp@oracle.com \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=mpatlasov@parallels.com \
--cc=zab@zabbo.net \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).