From: Dave Kleikamp <dave.kleikamp@oracle.com>
To: linux-fsdevel@vger.kernel.org
Cc: linux-kernel@vger.kernel.org, Zach Brown <zab@zabbo.net>,
"Maxim V. Patlasov" <mpatlasov@parallels.com>,
Dave Kleikamp <dave.kleikamp@oracle.com>
Subject: [PATCH V5 10/30] dio: add bio_vec support to __blockdev_direct_IO()
Date: Wed, 9 Jan 2013 13:58:25 -0600 [thread overview]
Message-ID: <1357761525-22718-11-git-send-email-dave.kleikamp@oracle.com> (raw)
In-Reply-To: <1357761525-22718-1-git-send-email-dave.kleikamp@oracle.com>
The trick here is to initialize the dio state so that do_direct_IO()
consumes the pages we provide and never tries to map user pages. This
is done by queueing the page in the dio ourselves and making sure that
final_block_in_request does not extend past that page, so that
do_direct_IO() returns before running out of pages.
The caller is responsible for dirtying these pages, if needed. We add
a should_dirty flag to struct dio that makes sure we only dirty read
pages when we're operating on iovecs of user addresses.
Signed-off-by: Dave Kleikamp <dave.kleikamp@oracle.com>
Cc: Zach Brown <zab@zabbo.net>
---
fs/direct-io.c | 206 +++++++++++++++++++++++++++++++++++++++++----------------
1 file changed, 148 insertions(+), 58 deletions(-)
diff --git a/fs/direct-io.c b/fs/direct-io.c
index b97a202..63e5a17 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -126,6 +126,7 @@ struct dio {
spinlock_t bio_lock; /* protects BIO fields below */
int page_errors; /* errno from get_user_pages() */
int is_async; /* is IO async ? */
+ int should_dirty; /* should we mark read pages dirty? */
int io_error; /* IO error in completion path */
unsigned long refcount; /* direct_io_worker() and bios */
struct bio *bio_list; /* singly linked via bi_private */
@@ -376,7 +377,7 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
dio->refcount++;
spin_unlock_irqrestore(&dio->bio_lock, flags);
- if (dio->is_async && dio->rw == READ)
+ if (dio->is_async && dio->rw == READ && dio->should_dirty)
bio_set_pages_dirty(bio);
if (sdio->submit_io)
@@ -447,13 +448,14 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio)
if (!uptodate)
dio->io_error = -EIO;
- if (dio->is_async && dio->rw == READ) {
+ if (dio->is_async && dio->rw == READ && dio->should_dirty) {
bio_check_pages_dirty(bio); /* transfers ownership */
} else {
for (page_no = 0; page_no < bio->bi_vcnt; page_no++) {
struct page *page = bvec[page_no].bv_page;
- if (dio->rw == READ && !PageCompound(page))
+ if (dio->rw == READ && !PageCompound(page) &&
+ dio->should_dirty)
set_page_dirty_lock(page);
page_cache_release(page);
}
@@ -1020,6 +1022,101 @@ static inline int drop_refcount(struct dio *dio)
return ret2;
}
+static ssize_t direct_IO_iovec(const struct iovec *iov, unsigned long nr_segs,
+ struct dio *dio, struct dio_submit *sdio,
+ unsigned blkbits, struct buffer_head *map_bh)
+{
+ size_t bytes;
+ ssize_t retval = 0;
+ int seg;
+ unsigned long user_addr;
+
+ for (seg = 0; seg < nr_segs; seg++) {
+ user_addr = (unsigned long)iov[seg].iov_base;
+ sdio->pages_in_io +=
+ ((user_addr + iov[seg].iov_len + PAGE_SIZE-1) /
+ PAGE_SIZE - user_addr / PAGE_SIZE);
+ }
+
+ dio->should_dirty = 1;
+
+ for (seg = 0; seg < nr_segs; seg++) {
+ user_addr = (unsigned long)iov[seg].iov_base;
+ sdio->size += bytes = iov[seg].iov_len;
+
+ /* Index into the first page of the first block */
+ sdio->first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits;
+ sdio->final_block_in_request = sdio->block_in_file +
+ (bytes >> blkbits);
+ /* Page fetching state */
+ sdio->head = 0;
+ sdio->tail = 0;
+ sdio->curr_page = 0;
+
+ sdio->total_pages = 0;
+ if (user_addr & (PAGE_SIZE-1)) {
+ sdio->total_pages++;
+ bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1));
+ }
+ sdio->total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
+ sdio->curr_user_address = user_addr;
+
+ retval = do_direct_IO(dio, sdio, map_bh);
+
+ dio->result += iov[seg].iov_len -
+ ((sdio->final_block_in_request - sdio->block_in_file) <<
+ blkbits);
+
+ if (retval) {
+ dio_cleanup(dio, sdio);
+ break;
+ }
+ } /* end iovec loop */
+
+ return retval;
+}
+
+static ssize_t direct_IO_bvec(struct bio_vec *bvec, unsigned long nr_segs,
+ struct dio *dio, struct dio_submit *sdio,
+ unsigned blkbits, struct buffer_head *map_bh)
+{
+ ssize_t retval = 0;
+ int seg;
+
+ sdio->pages_in_io += nr_segs;
+
+ for (seg = 0; seg < nr_segs; seg++) {
+ sdio->size += bvec[seg].bv_len;
+
+ /* Index into the first page of the first block */
+ sdio->first_block_in_page = bvec[seg].bv_offset >> blkbits;
+ sdio->final_block_in_request = sdio->block_in_file +
+ (bvec[seg].bv_len >> blkbits);
+ /* Page fetching state */
+ sdio->curr_page = 0;
+ page_cache_get(bvec[seg].bv_page);
+ dio->pages[0] = bvec[seg].bv_page;
+ sdio->head = 0;
+ sdio->tail = 1;
+
+ sdio->total_pages = 1;
+ sdio->curr_user_address = 0;
+
+ retval = do_direct_IO(dio, sdio, map_bh);
+
+ dio->result += bvec[seg].bv_len -
+ ((sdio->final_block_in_request - sdio->block_in_file) <<
+ blkbits);
+
+ if (retval) {
+ dio_cleanup(dio, sdio);
+ break;
+ }
+ }
+
+ return retval;
+}
+
/*
* This is a library function for use by filesystem drivers.
*
@@ -1061,11 +1158,8 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
loff_t end = offset;
struct dio *dio;
struct dio_submit sdio = { 0, };
- unsigned long user_addr;
- size_t bytes;
struct buffer_head map_bh = { 0, };
struct blk_plug plug;
- const struct iovec *iov = iov_iter_iovec(iter);
unsigned long nr_segs = iter->nr_segs;
if (rw & WRITE)
@@ -1085,20 +1179,49 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
}
/* Check the memory alignment. Blocks cannot straddle pages */
- for (seg = 0; seg < nr_segs; seg++) {
- addr = (unsigned long)iov[seg].iov_base;
- size = iov[seg].iov_len;
- end += size;
- if (unlikely((addr & blocksize_mask) ||
- (size & blocksize_mask))) {
- if (bdev)
- blkbits = blksize_bits(
- bdev_logical_block_size(bdev));
- blocksize_mask = (1 << blkbits) - 1;
- if ((addr & blocksize_mask) || (size & blocksize_mask))
- goto out;
+ if (iov_iter_has_iovec(iter)) {
+ const struct iovec *iov = iov_iter_iovec(iter);
+
+ for (seg = 0; seg < nr_segs; seg++) {
+ addr = (unsigned long)iov[seg].iov_base;
+ size = iov[seg].iov_len;
+ end += size;
+ if (unlikely((addr & blocksize_mask) ||
+ (size & blocksize_mask))) {
+ if (bdev)
+ blkbits = blksize_bits(
+ bdev_logical_block_size(bdev));
+ blocksize_mask = (1 << blkbits) - 1;
+ if ((addr & blocksize_mask) ||
+ (size & blocksize_mask))
+ goto out;
+ }
}
- }
+ } else if (iov_iter_has_bvec(iter)) {
+ /*
+ * Is this necessary, or can we trust the in-kernel
+ * caller? Can we replace this with
+ * end += iov_iter_count(iter); ?
+ */
+ struct bio_vec *bvec = iov_iter_bvec(iter);
+
+ for (seg = 0; seg < nr_segs; seg++) {
+ addr = bvec[seg].bv_offset;
+ size = bvec[seg].bv_len;
+ end += size;
+ if (unlikely((addr & blocksize_mask) ||
+ (size & blocksize_mask))) {
+ if (bdev)
+ blkbits = blksize_bits(
+ bdev_logical_block_size(bdev));
+ blocksize_mask = (1 << blkbits) - 1;
+ if ((addr & blocksize_mask) ||
+ (size & blocksize_mask))
+ goto out;
+ }
+ }
+ } else
+ BUG();
/* watch out for a 0 len io from a tricksy fs */
if (rw == READ && end == offset)
@@ -1175,47 +1298,14 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
if (unlikely(sdio.blkfactor))
sdio.pages_in_io = 2;
- for (seg = 0; seg < nr_segs; seg++) {
- user_addr = (unsigned long)iov[seg].iov_base;
- sdio.pages_in_io +=
- ((user_addr + iov[seg].iov_len + PAGE_SIZE-1) /
- PAGE_SIZE - user_addr / PAGE_SIZE);
- }
-
blk_start_plug(&plug);
- for (seg = 0; seg < nr_segs; seg++) {
- user_addr = (unsigned long)iov[seg].iov_base;
- sdio.size += bytes = iov[seg].iov_len;
-
- /* Index into the first page of the first block */
- sdio.first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits;
- sdio.final_block_in_request = sdio.block_in_file +
- (bytes >> blkbits);
- /* Page fetching state */
- sdio.head = 0;
- sdio.tail = 0;
- sdio.curr_page = 0;
-
- sdio.total_pages = 0;
- if (user_addr & (PAGE_SIZE-1)) {
- sdio.total_pages++;
- bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1));
- }
- sdio.total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
- sdio.curr_user_address = user_addr;
-
- retval = do_direct_IO(dio, &sdio, &map_bh);
-
- dio->result += iov[seg].iov_len -
- ((sdio.final_block_in_request - sdio.block_in_file) <<
- blkbits);
-
- if (retval) {
- dio_cleanup(dio, &sdio);
- break;
- }
- } /* end iovec loop */
+ if (iov_iter_has_iovec(iter))
+ retval = direct_IO_iovec(iov_iter_iovec(iter), nr_segs, dio,
+ &sdio, blkbits, &map_bh);
+ else
+ retval = direct_IO_bvec(iov_iter_bvec(iter), nr_segs, dio,
+ &sdio, blkbits, &map_bh);
if (retval == -ENOTBLK) {
/*
--
1.8.1
next prev parent reply other threads:[~2013-01-09 19:58 UTC|newest]
Thread overview: 52+ messages / expand[flat|nested] mbox.gz Atom feed top
2013-01-09 19:58 [PATCH V5 00/30] loop: Issue O_DIRECT aio using bio_vec Dave Kleikamp
2013-01-09 19:58 ` [PATCH V5 01/30] iov_iter: move into its own file Dave Kleikamp
2013-01-09 19:58 ` [PATCH V5 02/30] iov_iter: iov_iter_copy_from_user() should use non-atomic copy Dave Kleikamp
2013-01-09 19:58 ` [PATCH V5 03/30] iov_iter: add copy_to_user support Dave Kleikamp
2013-01-09 19:58 ` [PATCH V5 05/30] iov_iter: hide iovec details behind ops function pointers Dave Kleikamp
2013-01-09 19:58 ` [PATCH V5 06/30] iov_iter: add bvec support Dave Kleikamp
2013-01-09 19:58 ` [PATCH V5 07/30] iov_iter: add a shorten call Dave Kleikamp
2013-01-09 19:58 ` [PATCH V5 08/30] iov_iter: let callers extract iovecs and bio_vecs Dave Kleikamp
2013-01-09 19:58 ` [PATCH V5 09/30] dio: Convert direct_IO to use iov_iter Dave Kleikamp
2013-01-09 19:58 ` Dave Kleikamp [this message]
2013-01-09 19:58 ` [PATCH V5 11/30] fs: pull iov_iter use higher up the stack Dave Kleikamp
2013-01-09 19:58 ` [PATCH V5 12/30] aio: add aio_kernel_() interface Dave Kleikamp
2013-01-09 19:58 ` [PATCH V5 13/30] aio: add aio support for iov_iter arguments Dave Kleikamp
2013-01-09 19:58 ` [PATCH V5 14/30] bio: add bvec_length(), like iov_length() Dave Kleikamp
2013-01-09 19:58 ` [PATCH V5 15/30] loop: use aio to perform io on the underlying file Dave Kleikamp
2013-01-09 19:58 ` [PATCH V5 16/30] fs: create file_readable() and file_writable() functions Dave Kleikamp
2013-01-09 19:58 ` [PATCH V5 17/30] fs: use read_iter and write_iter rather than aio_read and aio_write Dave Kleikamp
2013-01-18 21:26 ` Jeff Moyer
2013-01-18 22:03 ` Dave Kleikamp
2013-01-09 19:58 ` [PATCH V5 18/30] fs: add read_iter and write_iter to several file systems Dave Kleikamp
2013-01-10 12:40 ` Boaz Harrosh
2013-01-09 19:58 ` [PATCH V5 19/30] ocfs2: add support for read_iter, write_iter, and direct_IO_bvec Dave Kleikamp
2013-01-09 19:58 ` [PATCH V5 20/30] ext4: add support for read_iter and write_iter Dave Kleikamp
2013-01-09 19:58 ` [PATCH V5 21/30] nfs: add support for read_iter, write_iter Dave Kleikamp
[not found] ` <1357761525-22718-1-git-send-email-dave.kleikamp-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
2013-01-09 19:58 ` [PATCH V5 04/30] fuse: convert fuse to use iov_iter_copy_[to|from]_user Dave Kleikamp
2013-01-09 19:58 ` [PATCH V5 22/30] nfs: simplify swap Dave Kleikamp
2013-01-09 20:01 ` Rik van Riel
2013-01-09 19:58 ` [PATCH V5 23/30] btrfs: add support for read_iter and write_iter Dave Kleikamp
2013-01-09 19:58 ` [PATCH V5 24/30] block_dev: add support for read_iter, write_iter Dave Kleikamp
2013-01-09 19:58 ` [PATCH V5 25/30] xfs: add support for read_iter and write_iter Dave Kleikamp
2013-01-09 19:58 ` [PATCH V5 26/30] gfs2: Convert aio_read/write ops to read/write_iter Dave Kleikamp
2013-01-10 10:10 ` Steven Whitehouse
2013-01-10 14:34 ` Dave Kleikamp
2013-01-11 16:24 ` Steven Whitehouse
2013-01-09 19:58 ` [PATCH V5 27/30] udf: convert file ops from aio_read/write " Dave Kleikamp
2013-01-09 19:58 ` [PATCH V5 28/30] afs: add support for read_iter and write_iter Dave Kleikamp
2013-01-09 19:58 ` [PATCH V5 29/30] ecrpytfs: Convert aio_read/write ops to read/write_iter Dave Kleikamp
2013-01-09 19:58 ` [PATCH V5 30/30] ubifs: convert file ops from aio_read/write " Dave Kleikamp
2013-01-10 15:46 ` [PATCH V5 00/30] loop: Issue O_DIRECT aio using bio_vec Sedat Dilek
2013-01-11 21:51 ` Dave Kleikamp
2013-01-12 10:52 ` Sedat Dilek
2013-01-12 12:54 ` Sedat Dilek
2013-01-16 16:32 ` James Bottomley
2013-01-16 18:25 ` Sedat Dilek
2013-01-17 23:49 ` James Bottomley
2013-01-18 17:16 ` Jeff Moyer
2013-01-18 17:56 ` Jeff Moyer
2013-01-18 17:58 ` Dave Kleikamp
2013-01-18 21:48 ` Jeff Moyer
2013-01-18 22:33 ` Dave Kleikamp
2013-01-22 16:14 ` Jeff Moyer
2013-01-20 22:26 ` Dave Chinner
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1357761525-22718-11-git-send-email-dave.kleikamp@oracle.com \
--to=dave.kleikamp@oracle.com \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=mpatlasov@parallels.com \
--cc=zab@zabbo.net \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).