From: Suparna Bhattacharya <suparna@in.ibm.com>
To: Chris Mason <chris.mason@oracle.com>
Cc: linux-fsdevel@vger.kernel.org, akpm@linux-foundation.org
Subject: Re: [PATCH 4 of 8] Add flags to control direct IO helpers
Date: Wed, 7 Feb 2007 22:38:45 +0530 [thread overview]
Message-ID: <20070207170845.GA13893@in.ibm.com> (raw)
In-Reply-To: <04dd7ddd593e9f147723.1170811969@opti.oraclecorp.com>
On Tue, Feb 06, 2007 at 08:32:49PM -0400, Chris Mason wrote:
> This creates a number of flags so that filesystems can control
> blockdev_direct_IO. It is based on code from Russell Cettelan.
>
> The new flags are:
> DIO_CREATE -- always pass create=1 to get_block on writes. This allows
> DIO to fill holes in the file.
> DIO_PLACEHOLDERS -- use placeholder pages to provide locking against buffered
> io and truncates.
> DIO_DROP_I_MUTEX -- drop i_mutex before starting the mapping, io submission,
> or io waiting. The mutex is still dropped for AIO
> as well.
>
> Some API changes are made so that filesystems can have more control
> over the DIO features.
>
> __blockdev_direct_IO is more or less renamed to blockdev_direct_IO_flags.
> All waiting and invalidating of page cache data is pushed down into
> blockdev_direct_IO_flags (and removed from mm/filemap.c)
>
> direct_io_worker is exported into the wild. Filesystems that want to be
> special can pull out the bits of blockdev_direct_IO_flags they care about
> and then call direct_io_worker directly.
>
> Signed-off-by: Chris Mason <chris.mason@oracle.com>
>
> diff -r 1a7105ab9c19 -r 04dd7ddd593e fs/direct-io.c
> --- a/fs/direct-io.c Tue Feb 06 20:02:55 2007 -0500
> +++ b/fs/direct-io.c Tue Feb 06 20:02:56 2007 -0500
> @@ -1,4 +1,3 @@
> - GFP_KERNEL, 1);
> /*
> * fs/direct-io.c
> *
> @@ -55,13 +54,6 @@
> *
> * If blkfactor is zero then the user's request was aligned to the filesystem's
> * blocksize.
> - *
> - * lock_type is DIO_LOCKING for regular files on direct-IO-naive filesystems.
> - * This determines whether we need to do the fancy locking which prevents
> - * direct-IO from being able to read uninitialised disk blocks. If its zero
> - * (blockdev) this locking is not done, and if it is DIO_OWN_LOCKING i_mutex is
> - * not held for the entire direct write (taken briefly, initially, during a
> - * direct read though, but its never held for the duration of a direct-IO).
> */
>
> struct dio {
> @@ -70,8 +62,7 @@ struct dio {
> struct inode *inode;
> int rw;
> loff_t i_size; /* i_size when submitted */
> - int lock_type; /* doesn't change */
> - int reacquire_i_mutex; /* should we get i_mutex when done? */
> + unsigned flags; /* locking and get_block flags */
> unsigned blkbits; /* doesn't change */
> unsigned blkfactor; /* When we're using an alignment which
> is finer than the filesystem's soft
> @@ -211,7 +202,7 @@ out:
>
> static void dio_unlock_page_range(struct dio *dio)
> {
> - if (dio->lock_type != DIO_NO_LOCKING) {
> + if (dio->flags & DIO_PLACEHOLDERS) {
> remove_placeholder_pages(dio->inode->i_mapping,
> dio->fspages_start_off,
> dio->fspages_end_off);
> @@ -226,7 +217,7 @@ static int dio_lock_page_range(struct di
> unsigned long max_size;
> int ret = 0;
>
> - if (dio->lock_type == DIO_NO_LOCKING)
> + if (!(dio->flags & DIO_PLACEHOLDERS))
> return 0;
>
> while (index >= dio->fspages_end_off) {
> @@ -310,9 +301,6 @@ static int dio_complete(struct dio *dio,
> dio->map_bh.b_private);
> dio_unlock_page_range(dio);
>
> - if (dio->reacquire_i_mutex)
> - mutex_lock(&dio->inode->i_mutex);
> -
> if (ret == 0)
> ret = dio->page_errors;
> if (ret == 0)
> @@ -597,8 +585,9 @@ static int get_more_blocks(struct dio *d
> map_bh->b_state = 0;
> map_bh->b_size = fs_count << dio->inode->i_blkbits;
>
> - create = dio->rw & WRITE;
> - if (dio->lock_type == DIO_NO_LOCKING)
> + if (dio->flags & DIO_CREATE)
> + create = dio->rw & WRITE;
> + else
> create = 0;
> index = fs_startblk >> (PAGE_CACHE_SHIFT -
> dio->inode->i_blkbits);
> @@ -1014,19 +1003,41 @@ out:
> return ret;
> }
>
> -static ssize_t
> -direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
> - const struct iovec *iov, loff_t offset, unsigned long nr_segs,
> +/*
> + * This does all the real work of the direct io. Most filesystems want to
> + * call blockdev_direct_IO_flags instead, but if you have exotic locking
> + * routines you can call this directly.
> + *
> + * The flags parameter is a bitmask of:
> + *
> + * DIO_PLACEHOLDERS (use placeholder pages for locking)
> + * DIO_CREATE (pass create=1 to get_block for filling holes or extending)
A little more explanation about why these options are needed, and examples
of when one would specify each of these options would be good.
Regards
Suparna
> + * DIO_DROP_I_MUTEX (drop inode->i_mutex during writes)
> + */
> +ssize_t
> +direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
> + const struct iovec *iov, loff_t offset, unsigned long nr_segs,
> unsigned blkbits, get_block_t get_block, dio_iodone_t end_io,
> - struct dio *dio)
> -{
> - unsigned long user_addr;
> + int is_async, unsigned dioflags)
> +{
> + unsigned long user_addr;
> unsigned long flags;
> int seg;
> ssize_t ret = 0;
> ssize_t ret2;
> size_t bytes;
> -
> + struct dio *dio;
> +
> + if (rw & WRITE)
> + rw = WRITE_SYNC;
> +
> + dio = kmalloc(sizeof(*dio), GFP_KERNEL);
> + ret = -ENOMEM;
> + if (!dio)
> + goto out;
> +
> + dio->flags = dioflags;
> + dio->is_async = is_async;
> dio->bio = NULL;
> dio->inode = inode;
> dio->rw = rw;
> @@ -1057,7 +1068,7 @@ direct_io_worker(int rw, struct kiocb *i
> dio->bio_list = NULL;
> dio->waiter = NULL;
>
> - if (dio->lock_type != DIO_NO_LOCKING) {
> + if (dio->flags & DIO_PLACEHOLDERS) {
> dio->fspages_start_off = offset >> PAGE_CACHE_SHIFT;
> dio->fspages_end_off = dio->fspages_start_off;
>
> @@ -1192,33 +1203,24 @@ direct_io_worker(int rw, struct kiocb *i
> } else
> BUG_ON(ret != -EIOCBQUEUED);
>
> +out:
> return ret;
> }
> -
> -/*
> - * This is a library function for use by filesystem drivers.
> - * The locking rules are governed by the dio_lock_type parameter.
> - *
> - * DIO_NO_LOCKING (no locking, for raw block device access)
> - * For writes, i_mutex is not held on entry; it is never taken.
> - *
> - * DIO_LOCKING (simple locking for regular files)
> - * For writes we are called under i_mutex and return with i_mutex held, even
> - * though it is internally dropped.
> - *
> - * DIO_OWN_LOCKING (filesystem provides synchronisation and handling of
> - * uninitialised data, allowing parallel direct readers and writers)
> - * For writes we are called without i_mutex, return without it, never touch it.
> - * For reads we are called under i_mutex and return with i_mutex held, even
> - * though it may be internally dropped.
> - *
> - * Additional i_alloc_sem locking requirements described inline below.
> - */
> -ssize_t
> -__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
> - struct block_device *bdev, const struct iovec *iov, loff_t offset,
> - unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
> - int dio_lock_type)
> +EXPORT_SYMBOL(direct_io_worker);
> +
> +/*
> + * A utility function fro blockdev_direct_IO_flags, this checks
> + * alignment of a O_DIRECT iovec against filesystem and blockdevice
> + * requirements.
> + *
> + * It returns a blkbits value that will work for the io, and returns the
> + * end offset of the io (via blkbits_ret and end_ret).
> + *
> + * The function returns 0 if everything will work or -EINVAL on error
> + */
> +int check_dio_alignment(struct inode *inode, struct block_device *bdev,
> + const struct iovec *iov, loff_t offset, unsigned long nr_segs,
> + unsigned *blkbits_ret, loff_t *end_ret)
> {
> int seg;
> size_t size;
> @@ -1226,13 +1228,7 @@ __blockdev_direct_IO(int rw, struct kioc
> unsigned blkbits = inode->i_blkbits;
> unsigned bdev_blkbits = 0;
> unsigned blocksize_mask = (1 << blkbits) - 1;
> - ssize_t retval = -EINVAL;
> loff_t end = offset;
> - struct dio *dio;
> - struct address_space *mapping = iocb->ki_filp->f_mapping;
> -
> - if (rw & WRITE)
> - rw = WRITE_SYNC;
>
> if (bdev)
> bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev));
> @@ -1242,7 +1238,7 @@ __blockdev_direct_IO(int rw, struct kioc
> blkbits = bdev_blkbits;
> blocksize_mask = (1 << blkbits) - 1;
> if (offset & blocksize_mask)
> - goto out;
> + return -EINVAL;
> }
>
> /* Check the memory alignment. Blocks cannot straddle pages */
> @@ -1254,27 +1250,60 @@ __blockdev_direct_IO(int rw, struct kioc
> if (bdev)
> blkbits = bdev_blkbits;
> blocksize_mask = (1 << blkbits) - 1;
> - if ((addr & blocksize_mask) || (size & blocksize_mask))
> - goto out;
> + if ((addr & blocksize_mask) || (size & blocksize_mask))
> + return -EINVAL;
> }
> }
> - dio = kmalloc(sizeof(*dio), GFP_KERNEL);
> - retval = -ENOMEM;
> - if (!dio)
> + *end_ret = end;
> + *blkbits_ret = blkbits;
> + return 0;
> +}
> +EXPORT_SYMBOL(check_dio_alignment);
> +
> +/*
> + * This is a library function for use by filesystem drivers.
> + * The flags parameter is a bitmask of:
> + *
> + * DIO_PLACEHOLDERS (use placeholder pages for locking)
> + * DIO_CREATE (pass create=1 to get_block for filling holes)
> + * DIO_DROP_I_MUTEX (drop inode->i_mutex during writes)
> + */
> +ssize_t
> +blockdev_direct_IO_flags(int rw, struct kiocb *iocb, struct inode *inode,
> + struct block_device *bdev, const struct iovec *iov, loff_t offset,
> + unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
> + unsigned flags)
> +{
> + struct address_space *mapping = iocb->ki_filp->f_mapping;
> + unsigned blkbits = 0;
> + ssize_t retval = -EINVAL;
> + loff_t end = 0;
> + int is_async;
> + int grab_i_mutex = 0;
> +
> +
> + if (check_dio_alignment(inode, bdev, iov, offset, nr_segs,
> + &blkbits, &end))
> goto out;
>
> -
> - /*
> - * For block device access DIO_NO_LOCKING is used,
> - * neither readers nor writers do any locking at all
> - * For regular files using DIO_LOCKING,
> - * No locks are taken
> - * For regular files using DIO_OWN_LOCKING,
> - * neither readers nor writers take any locks here
> - */
> - dio->lock_type = dio_lock_type;
> -
> - if (dio->lock_type == DIO_NO_LOCKING && end > offset) {
> + if (rw & WRITE) {
> + /*
> + * If it's a write, unmap all mmappings of the file up-front.
> + * This will cause any pte dirty bits to be propagated into
> + * the pageframes for the subsequent filemap_write_and_wait().
> + */
> + if (mapping_mapped(mapping))
> + unmap_mapping_range(mapping, offset, end - offset, 0);
> + if (end <= i_size_read(inode) && (flags & DIO_DROP_I_MUTEX)) {
> + mutex_unlock(&inode->i_mutex);
> + grab_i_mutex = 1;
> + }
> + }
> + /*
> + * the placeholder code does filemap_write_and_wait, so if we
> + * aren't using placeholders we have to do it here
> + */
> + if (!(flags & DIO_PLACEHOLDERS) && end > offset) {
> retval = filemap_write_and_wait_range(mapping, offset, end - 1);
> if (retval)
> goto out;
> @@ -1286,19 +1315,30 @@ __blockdev_direct_IO(int rw, struct kioc
> * even for AIO, we need to wait for i/o to complete before
> * returning in this case.
> */
> - dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) &&
> + is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) &&
> (end > i_size_read(inode)));
>
> - /* if our write is inside i_size, we can drop i_mutex */
> - dio->reacquire_i_mutex = 0;
> - if ((rw & WRITE) && dio_lock_type == DIO_LOCKING &&
> - end <= i_size_read(inode) && is_sync_kiocb(iocb)) {
> - dio->reacquire_i_mutex = 1;
> - mutex_unlock(&inode->i_mutex);
> - }
> retval = direct_io_worker(rw, iocb, inode, iov, offset,
> - nr_segs, blkbits, get_block, end_io, dio);
> + nr_segs, blkbits, get_block, end_io, is_async,
> + flags);
> out:
> + if (grab_i_mutex)
> + mutex_lock(&inode->i_mutex);
> +
> + if ((rw & WRITE) && mapping->nrpages) {
> + int err;
> + /* O_DIRECT is allowed to drop i_mutex, so more data
> + * could have been dirtied by others. Start io one more
> + * time
> + */
> + err = filemap_write_and_wait_range(mapping, offset, end - 1);
> + if (!err)
> + err = invalidate_inode_pages2_range(mapping,
> + offset >> PAGE_CACHE_SHIFT,
> + (end - 1) >> PAGE_CACHE_SHIFT);
> + if (!retval && err)
> + retval = err;
> + }
> return retval;
> }
> -EXPORT_SYMBOL(__blockdev_direct_IO);
> +EXPORT_SYMBOL(blockdev_direct_IO_flags);
> diff -r 1a7105ab9c19 -r 04dd7ddd593e include/linux/fs.h
> --- a/include/linux/fs.h Tue Feb 06 20:02:55 2007 -0500
> +++ b/include/linux/fs.h Tue Feb 06 20:02:56 2007 -0500
> @@ -1776,24 +1776,28 @@ static inline void do_generic_file_read(
> }
>
> #ifdef CONFIG_BLOCK
> -ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
> +int check_dio_alignment(struct inode *inode, struct block_device *bdev,
> + const struct iovec *iov, loff_t offset, unsigned long nr_segs,
> + unsigned *blkbits_ret, loff_t *end_ret);
> +
> +ssize_t blockdev_direct_IO_flags(int rw, struct kiocb *iocb, struct inode *inode,
> struct block_device *bdev, const struct iovec *iov, loff_t offset,
> unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
> - int lock_type);
> -
> -enum {
> - DIO_LOCKING = 1, /* need locking between buffered and direct access */
> - DIO_NO_LOCKING, /* bdev; no locking at all between buffered/direct */
> - DIO_OWN_LOCKING, /* filesystem locks buffered and direct internally */
> -};
> + unsigned int dio_flags);
> +
> +#define DIO_PLACEHOLDERS (1 << 0) /* insert placeholder pages */
> +#define DIO_CREATE (1 << 1) /* pass create=1 to get_block when writing */
> +#define DIO_DROP_I_MUTEX (1 << 2) /* drop i_mutex during writes */
>
> static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb,
> struct inode *inode, struct block_device *bdev, const struct iovec *iov,
> loff_t offset, unsigned long nr_segs, get_block_t get_block,
> dio_iodone_t end_io)
> {
> - return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
> - nr_segs, get_block, end_io, DIO_LOCKING);
> + /* locking is on, FS wants to fill holes w/get_block */
> + return blockdev_direct_IO_flags(rw, iocb, inode, bdev, iov, offset,
> + nr_segs, get_block, end_io, DIO_PLACEHOLDERS |
> + DIO_CREATE | DIO_DROP_I_MUTEX);
> }
>
> static inline ssize_t blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb,
> @@ -1801,17 +1805,9 @@ static inline ssize_t blockdev_direct_IO
> loff_t offset, unsigned long nr_segs, get_block_t get_block,
> dio_iodone_t end_io)
> {
> - return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
> - nr_segs, get_block, end_io, DIO_NO_LOCKING);
> -}
> -
> -static inline ssize_t blockdev_direct_IO_own_locking(int rw, struct kiocb *iocb,
> - struct inode *inode, struct block_device *bdev, const struct iovec *iov,
> - loff_t offset, unsigned long nr_segs, get_block_t get_block,
> - dio_iodone_t end_io)
> -{
> - return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
> - nr_segs, get_block, end_io, DIO_OWN_LOCKING);
> + /* locking is off, create is off */
> + return blockdev_direct_IO_flags(rw, iocb, inode, bdev, iov, offset,
> + nr_segs, get_block, end_io, 0);
> }
> #endif
>
> diff -r 1a7105ab9c19 -r 04dd7ddd593e mm/filemap.c
> --- a/mm/filemap.c Tue Feb 06 20:02:55 2007 -0500
> +++ b/mm/filemap.c Tue Feb 06 20:02:56 2007 -0500
> @@ -40,7 +40,7 @@
>
> #include <asm/mman.h>
>
> -static ssize_t
> +static inline ssize_t
> generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
> loff_t offset, unsigned long nr_segs);
>
> @@ -2817,46 +2817,12 @@ EXPORT_SYMBOL(generic_file_aio_write);
> * Called under i_mutex for writes to S_ISREG files. Returns -EIO if something
> * went wrong during pagecache shootdown.
> */
> -static ssize_t
> +static inline ssize_t
> generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
> loff_t offset, unsigned long nr_segs)
> {
> - struct file *file = iocb->ki_filp;
> - struct address_space *mapping = file->f_mapping;
> - ssize_t retval;
> - size_t write_len = 0;
> -
> - /*
> - * If it's a write, unmap all mmappings of the file up-front. This
> - * will cause any pte dirty bits to be propagated into the pageframes
> - * for the subsequent filemap_write_and_wait().
> - */
> - if (rw == WRITE) {
> - write_len = iov_length(iov, nr_segs);
> - if (mapping_mapped(mapping))
> - unmap_mapping_range(mapping, offset, write_len, 0);
> - }
> -
> - retval = mapping->a_ops->direct_IO(rw, iocb, iov,
> - offset, nr_segs);
> - if (rw == WRITE && mapping->nrpages) {
> - int err;
> - pgoff_t end = (offset + write_len - 1)
> - >> PAGE_CACHE_SHIFT;
> -
> - /* O_DIRECT is allowed to drop i_mutex, so more data
> - * could have been dirtied by others. Start io one more
> - * time
> - */
> - err = filemap_fdatawrite_range(mapping, offset,
> - offset + write_len - 1);
> - if (!err)
> - err = invalidate_inode_pages2_range(mapping,
> - offset >> PAGE_CACHE_SHIFT, end);
> - if (err)
> - retval = err;
> - }
> - return retval;
> + return iocb->ki_filp->f_mapping->a_ops->direct_IO(rw, iocb, iov,
> + offset, nr_segs);
> }
>
> /**
>
>
> -
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
--
Suparna Bhattacharya (suparna@in.ibm.com)
Linux Technology Center
IBM Software Lab, India
next prev parent reply other threads:[~2007-02-07 17:03 UTC|newest]
Thread overview: 18+ messages / expand[flat|nested] mbox.gz Atom feed top
2007-02-07 0:32 [RFC PATCH 0 of 8] O_DIRECT locking rework Chris Mason
2007-02-07 0:32 ` [PATCH 1 of 8] Introduce a place holder page for the pagecache Chris Mason
2007-02-07 17:36 ` Zach Brown
2007-02-07 0:32 ` [PATCH 2 of 8] Change O_DIRECT to use placeholders instead of i_mutex/i_alloc_sem locking Chris Mason
2007-02-07 20:11 ` Zach Brown
2007-02-07 20:22 ` Chris Mason
2007-02-07 20:34 ` Zach Brown
2007-02-07 0:32 ` [PATCH 3 of 8] DIO: don't fall back to buffered writes Chris Mason
2007-02-07 0:32 ` [PATCH 4 of 8] Add flags to control direct IO helpers Chris Mason
2007-02-07 17:08 ` Suparna Bhattacharya [this message]
2007-02-07 18:05 ` Chris Mason
2007-02-08 4:03 ` Suparna Bhattacharya
2007-02-08 12:58 ` Chris Mason
2007-02-07 0:32 ` [PATCH 5 of 8] Make ext3 safe for the new DIO locking rules Chris Mason
2007-02-07 0:32 ` [PATCH 6 of 8] Make reiserfs safe for " Chris Mason
2007-02-07 0:32 ` [PATCH 7 of 8] Adapt XFS to the new blockdev_direct_IO calls Chris Mason
2007-02-07 0:32 ` [PATCH 8 of 8] Avoid too many boundary buffers in DIO Chris Mason
-- strict thread matches above, loose matches on Subject: below --
2006-12-22 1:45 [PATCH 0 of 8] O_DIRECT locking rework v5 Chris Mason
2006-12-22 1:59 ` [PATCH 4 of 8] Add flags to control direct IO helpers Chris Mason
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20070207170845.GA13893@in.ibm.com \
--to=suparna@in.ibm.com \
--cc=akpm@linux-foundation.org \
--cc=chris.mason@oracle.com \
--cc=linux-fsdevel@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).