* [PATCH] cleanup blockdev_direct_IO locking
@ 2009-08-03 22:19 Christoph Hellwig
2009-08-03 22:59 ` Eric Sandeen
2009-09-26 19:53 ` Christoph Hellwig
0 siblings, 2 replies; 6+ messages in thread
From: Christoph Hellwig @ 2009-08-03 22:19 UTC (permalink / raw)
To: linux-fsdevel
Currently the locking in blockdev_direct_IO is a mess, we have three different
locking types and very confusing checks for some of them. The most
complicated one is DIO_OWN_LOCKING for reads, which happens to not actually be
used.
This patch gets rid of the DIO_OWN_LOCKING - as mentioned above the read case
is unused anyway, and the write side is almost identical to DIO_NO_LOCKING.
The difference is that DIO_NO_LOCKING always sets the create argument for
the get_blocks callback to zero, but we can easily move that to the actual
get_blocks callbacks. There are four users of the DIO_NO_LOCKING mode:
gfs already ignores the create argument and thus is fine with the new
version, ocfs2 only errors out if create were ever set, and we can remove
this dead code now, the block device code only ever uses create for an
error message if we are fully beyond the device which can never happen,
and last but not least XFS will need the new behavour for writes.
Once this replace the lock_type variable with a flags one, where no flag
means the DIO_NO_LOCKING behaviour and DIO_LOCKING is kept as the first
and so far only flag. Also revamp the documentation of the locking scheme
to actually make sense.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Index: linux-2.6/fs/ocfs2/aops.c
===================================================================
--- linux-2.6.orig/fs/ocfs2/aops.c 2009-08-03 21:34:07.194552517 +0200
+++ linux-2.6/fs/ocfs2/aops.c 2009-08-03 21:34:09.191806385 +0200
@@ -545,6 +545,9 @@ bail:
*
* called like this: dio->get_blocks(dio->inode, fs_startblk,
* fs_count, map_bh, dio->rw == WRITE);
+ *
+ * Note that we never bother to allocate blocks here, and thus ignore the
+ * create argument.
*/
static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
@@ -561,14 +564,6 @@ static int ocfs2_direct_IO_get_blocks(st
inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
- /*
- * Any write past EOF is not allowed because we'd be extending.
- */
- if (create && (iblock + max_blocks) > inode_blocks) {
- ret = -EIO;
- goto bail;
- }
-
/* This figures out the size of the next contiguous block, and
* our logical offset */
ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
@@ -580,15 +575,6 @@ static int ocfs2_direct_IO_get_blocks(st
goto bail;
}
- if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno && create) {
- ocfs2_error(inode->i_sb,
- "Inode %llu has a hole at block %llu\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- (unsigned long long)iblock);
- ret = -EROFS;
- goto bail;
- }
-
/*
* get_more_blocks() expects us to describe a hole by clearing
* the mapped bit on bh_result().
@@ -597,20 +583,8 @@ static int ocfs2_direct_IO_get_blocks(st
*/
if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
map_bh(bh_result, inode->i_sb, p_blkno);
- else {
- /*
- * ocfs2_prepare_inode_for_write() should have caught
- * the case where we'd be filling a hole and triggered
- * a buffered write instead.
- */
- if (create) {
- ret = -EIO;
- mlog_errno(ret);
- goto bail;
- }
-
+ else
clear_buffer_mapped(bh_result);
- }
/* make sure we don't map more than max_blocks blocks here as
that's all the kernel will handle at this point. */
Index: linux-2.6/fs/xfs/linux-2.6/xfs_aops.c
===================================================================
--- linux-2.6.orig/fs/xfs/linux-2.6/xfs_aops.c 2009-08-03 21:34:07.241790453 +0200
+++ linux-2.6/fs/xfs/linux-2.6/xfs_aops.c 2009-08-03 21:34:39.339534433 +0200
@@ -1545,19 +1545,13 @@ xfs_vm_direct_IO(
bdev = xfs_find_bdev_for_inode(XFS_I(inode));
- if (rw == WRITE) {
- iocb->private = xfs_alloc_ioend(inode, IOMAP_UNWRITTEN);
- ret = blockdev_direct_IO_own_locking(rw, iocb, inode,
- bdev, iov, offset, nr_segs,
- xfs_get_blocks_direct,
- xfs_end_io_direct);
- } else {
- iocb->private = xfs_alloc_ioend(inode, IOMAP_READ);
- ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
- bdev, iov, offset, nr_segs,
- xfs_get_blocks_direct,
- xfs_end_io_direct);
- }
+ iocb->private = xfs_alloc_ioend(inode, rw == WRITE ?
+ IOMAP_UNWRITTEN : IOMAP_READ);
+
+ ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov,
+ offset, nr_segs,
+ xfs_get_blocks_direct,
+ xfs_end_io_direct);
if (unlikely(ret != -EIOCBQUEUED && iocb->private))
xfs_destroy_ioend(iocb->private);
Index: linux-2.6/fs/direct-io.c
===================================================================
--- linux-2.6.orig/fs/direct-io.c 2009-08-03 21:34:07.246782220 +0200
+++ linux-2.6/fs/direct-io.c 2009-08-03 21:34:51.785784373 +0200
@@ -53,13 +53,6 @@
*
* If blkfactor is zero then the user's request was aligned to the filesystem's
* blocksize.
- *
- * lock_type is DIO_LOCKING for regular files on direct-IO-naive filesystems.
- * This determines whether we need to do the fancy locking which prevents
- * direct-IO from being able to read uninitialised disk blocks. If its zero
- * (blockdev) this locking is not done, and if it is DIO_OWN_LOCKING i_mutex is
- * not held for the entire direct write (taken briefly, initially, during a
- * direct read though, but its never held for the duration of a direct-IO).
*/
struct dio {
@@ -68,7 +61,7 @@ struct dio {
struct inode *inode;
int rw;
loff_t i_size; /* i_size when submitted */
- int lock_type; /* doesn't change */
+ int flags; /* doesn't change */
unsigned blkbits; /* doesn't change */
unsigned blkfactor; /* When we're using an alignment which
is finer than the filesystem's soft
@@ -240,7 +233,8 @@ static int dio_complete(struct dio *dio,
if (dio->end_io && dio->result)
dio->end_io(dio->iocb, offset, transferred,
dio->map_bh.b_private);
- if (dio->lock_type == DIO_LOCKING)
+
+ if (dio->flags & DIO_LOCKING)
/* lockdep: non-owner release */
up_read_non_owner(&dio->inode->i_alloc_sem);
@@ -515,21 +509,22 @@ static int get_more_blocks(struct dio *d
map_bh->b_state = 0;
map_bh->b_size = fs_count << dio->inode->i_blkbits;
+ /*
+ * For writes inside i_size on a DIO_LOCKING filesystem we
+ * forbid block creations: only overwrites are permitted.
+ * For this case we fall back to buffered writes at a higher
+ * level for inside-i_size block-instantiating writes.
+ *
+ * For filesystems with their own locking the decision is
+ * left to the get_blocks method.
+ */
create = dio->rw & WRITE;
- if (dio->lock_type == DIO_LOCKING) {
+ if (dio->flags & DIO_LOCKING) {
if (dio->block_in_file < (i_size_read(dio->inode) >>
dio->blkbits))
create = 0;
- } else if (dio->lock_type == DIO_NO_LOCKING) {
- create = 0;
}
- /*
- * For writes inside i_size we forbid block creations: only
- * overwrites are permitted. We fall back to buffered writes
- * at a higher level for inside-i_size block-instantiating
- * writes.
- */
ret = (*dio->get_block)(dio->inode, fs_startblk,
map_bh, create);
}
@@ -1042,7 +1037,7 @@ direct_io_worker(int rw, struct kiocb *i
* we can let i_mutex go now that its achieved its purpose
* of protecting us from looking up uninitialized blocks.
*/
- if ((rw == READ) && (dio->lock_type == DIO_LOCKING))
+ if (rw == READ && (dio->flags & DIO_LOCKING))
mutex_unlock(&dio->inode->i_mutex);
/*
@@ -1086,30 +1081,29 @@ direct_io_worker(int rw, struct kiocb *i
/*
* This is a library function for use by filesystem drivers.
- * The locking rules are governed by the dio_lock_type parameter.
*
- * DIO_NO_LOCKING (no locking, for raw block device access)
- * For writes, i_mutex is not held on entry; it is never taken.
- *
- * DIO_LOCKING (simple locking for regular files)
- * For writes we are called under i_mutex and return with i_mutex held, even
- * though it is internally dropped.
- * For reads, i_mutex is not held on entry, but it is taken and dropped before
- * returning.
- *
- * DIO_OWN_LOCKING (filesystem provides synchronisation and handling of
- * uninitialised data, allowing parallel direct readers and writers)
- * For writes we are called without i_mutex, return without it, never touch it.
- * For reads we are called under i_mutex and return with i_mutex held, even
- * though it may be internally dropped.
- *
- * Additional i_alloc_sem locking requirements described inline below.
+ * The locking rules are governed by the flags parameter:
+ * - if the flags value contains DIO_LOCKING we use a fancy locking
+ * scheme for dumb filesystems.
+ * For writes this functions is called under i_mutex and return with
+ * i_mutex held, even though it is internally dropped.
+ * For reads, i_mutex is not held on entry, but it is taken and dropped
+ * before returning.
+ * For reads and writes i_alloc_sem is taken in shared mode and released
+ * on I/O completion (which may happen asynchronously after returning to
+ * the caller).
+ *
+ * - if the flags value does NOT contain DIO_LOCKING we don't use any
+ * internal locking but rather rely on the filesystem to synchronize
+ * direct I/O reads/writes versus each other and truncate.
+ * For reads and writes both i_mutex and i_alloc_sem are not held on
+ * entry and are never taken.
*/
ssize_t
__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
struct block_device *bdev, const struct iovec *iov, loff_t offset,
unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
- int dio_lock_type)
+ int flags)
{
int seg;
size_t size;
@@ -1120,8 +1114,6 @@ __blockdev_direct_IO(int rw, struct kioc
ssize_t retval = -EINVAL;
loff_t end = offset;
struct dio *dio;
- int release_i_mutex = 0;
- int acquire_i_mutex = 0;
if (rw & WRITE)
rw = WRITE_ODIRECT;
@@ -1156,43 +1148,30 @@ __blockdev_direct_IO(int rw, struct kioc
if (!dio)
goto out;
- /*
- * For block device access DIO_NO_LOCKING is used,
- * neither readers nor writers do any locking at all
- * For regular files using DIO_LOCKING,
- * readers need to grab i_mutex and i_alloc_sem
- * writers need to grab i_alloc_sem only (i_mutex is already held)
- * For regular files using DIO_OWN_LOCKING,
- * neither readers nor writers take any locks here
- */
- dio->lock_type = dio_lock_type;
- if (dio_lock_type != DIO_NO_LOCKING) {
+ dio->flags = flags;
+ if (dio->flags & DIO_LOCKING) {
/* watch out for a 0 len io from a tricksy fs */
if (rw == READ && end > offset) {
- struct address_space *mapping;
+ struct address_space *mapping =
+ iocb->ki_filp->f_mapping;
- mapping = iocb->ki_filp->f_mapping;
- if (dio_lock_type != DIO_OWN_LOCKING) {
- mutex_lock(&inode->i_mutex);
- release_i_mutex = 1;
- }
+ /* will be released by direct_io_worker */
+ mutex_lock(&inode->i_mutex);
retval = filemap_write_and_wait_range(mapping, offset,
end - 1);
if (retval) {
+ mutex_unlock(&inode->i_mutex);
kfree(dio);
goto out;
}
-
- if (dio_lock_type == DIO_OWN_LOCKING) {
- mutex_unlock(&inode->i_mutex);
- acquire_i_mutex = 1;
- }
}
- if (dio_lock_type == DIO_LOCKING)
- /* lockdep: not the owner will release it */
- down_read_non_owner(&inode->i_alloc_sem);
+ /*
+ * Will be released at I/O completion, possible in a
+ * different thread.
+ */
+ down_read_non_owner(&inode->i_alloc_sem);
}
/*
@@ -1210,24 +1189,19 @@ __blockdev_direct_IO(int rw, struct kioc
/*
* In case of error extending write may have instantiated a few
* blocks outside i_size. Trim these off again for DIO_LOCKING.
- * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this by
- * it's own meaner.
+ *
+ * NOTE: filesystems with their own locking have to handle this
+ * on their own.
*/
- if (unlikely(retval < 0 && (rw & WRITE))) {
- loff_t isize = i_size_read(inode);
-
- if (end > isize && dio_lock_type == DIO_LOCKING)
- vmtruncate(inode, isize);
+ if (dio->flags & DIO_LOCKING) {
+ if (unlikely((rw & WRITE) && retval < 0)) {
+ loff_t isize = i_size_read(inode);
+ if (end > isize )
+ vmtruncate(inode, isize);
+ }
}
- if (rw == READ && dio_lock_type == DIO_LOCKING)
- release_i_mutex = 0;
-
out:
- if (release_i_mutex)
- mutex_unlock(&inode->i_mutex);
- else if (acquire_i_mutex)
- mutex_lock(&inode->i_mutex);
return retval;
}
EXPORT_SYMBOL(__blockdev_direct_IO);
Index: linux-2.6/include/linux/fs.h
===================================================================
--- linux-2.6.orig/include/linux/fs.h 2009-08-03 21:34:54.254532317 +0200
+++ linux-2.6/include/linux/fs.h 2009-08-03 21:35:15.455534350 +0200
@@ -2247,8 +2247,6 @@ ssize_t __blockdev_direct_IO(int rw, str
enum {
DIO_LOCKING = 1, /* need locking between buffered and direct access */
- DIO_NO_LOCKING, /* bdev; no locking at all between buffered/direct */
- DIO_OWN_LOCKING, /* filesystem locks buffered and direct internally */
};
static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb,
@@ -2266,16 +2264,7 @@ static inline ssize_t blockdev_direct_IO
dio_iodone_t end_io)
{
return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
- nr_segs, get_block, end_io, DIO_NO_LOCKING);
-}
-
-static inline ssize_t blockdev_direct_IO_own_locking(int rw, struct kiocb *iocb,
- struct inode *inode, struct block_device *bdev, const struct iovec *iov,
- loff_t offset, unsigned long nr_segs, get_block_t get_block,
- dio_iodone_t end_io)
-{
- return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
- nr_segs, get_block, end_io, DIO_OWN_LOCKING);
+ nr_segs, get_block, end_io, 0);
}
#endif
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] cleanup blockdev_direct_IO locking
2009-08-03 22:19 [PATCH] cleanup blockdev_direct_IO locking Christoph Hellwig
@ 2009-08-03 22:59 ` Eric Sandeen
2009-08-06 21:20 ` Christoph Hellwig
2009-09-26 19:53 ` Christoph Hellwig
1 sibling, 1 reply; 6+ messages in thread
From: Eric Sandeen @ 2009-08-03 22:59 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: linux-fsdevel
Christoph Hellwig wrote:
> Currently the locking in blockdev_direct_IO is a mess, we have three different
> locking types and very confusing checks for some of them. The most
> complicated one is DIO_OWN_LOCKING for reads, which happens to not actually be
> used.
>
> This patch gets rid of the DIO_OWN_LOCKING - as mentioned above the read case
> is unused anyway, and the write side is almost identical to DIO_NO_LOCKING.
> The difference is that DIO_NO_LOCKING always sets the create argument for
> the get_blocks callback to zero, but we can easily move that to the actual
> get_blocks callbacks. There are four users of the DIO_NO_LOCKING mode:
> gfs already ignores the create argument and thus is fine with the new
> version, ocfs2 only errors out if create were ever set, and we can remove
> this dead code now, the block device code only ever uses create for an
> error message if we are fully beyond the device which can never happen,
> and last but not least XFS will need the new behavour for writes.
>
> Once this replace the lock_type variable with a flags one, where no flag
> means the DIO_NO_LOCKING behaviour and DIO_LOCKING is kept as the first
> and so far only flag. Also revamp the documentation of the locking scheme
> to actually make sense.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
I generally like it, though I'd rather DIO_LOCKING didn't imply
hole-filling ability; I guess it's related but is it 1:1 ? Maybe we can
have a hole-filling-related flag right off the bat. Couple other
comments below inline.
<ocfs2 & xfs snipped>
> Index: linux-2.6/fs/direct-io.c
> ===================================================================
> --- linux-2.6.orig/fs/direct-io.c 2009-08-03 21:34:07.246782220 +0200
> +++ linux-2.6/fs/direct-io.c 2009-08-03 21:34:51.785784373 +0200
> @@ -53,13 +53,6 @@
> *
> * If blkfactor is zero then the user's request was aligned to the filesystem's
> * blocksize.
> - *
> - * lock_type is DIO_LOCKING for regular files on direct-IO-naive filesystems.
> - * This determines whether we need to do the fancy locking which prevents
> - * direct-IO from being able to read uninitialised disk blocks. If its zero
> - * (blockdev) this locking is not done, and if it is DIO_OWN_LOCKING i_mutex is
> - * not held for the entire direct write (taken briefly, initially, during a
> - * direct read though, but its never held for the duration of a direct-IO).
> */
Yay for consolidating the locking comments.
> struct dio {
> @@ -68,7 +61,7 @@ struct dio {
> struct inode *inode;
> int rw;
> loff_t i_size; /* i_size when submitted */
> - int lock_type; /* doesn't change */
> + int flags; /* doesn't change */
> unsigned blkbits; /* doesn't change */
> unsigned blkfactor; /* When we're using an alignment which
> is finer than the filesystem's soft
> @@ -240,7 +233,8 @@ static int dio_complete(struct dio *dio,
> if (dio->end_io && dio->result)
> dio->end_io(dio->iocb, offset, transferred,
> dio->map_bh.b_private);
> - if (dio->lock_type == DIO_LOCKING)
> +
> + if (dio->flags & DIO_LOCKING)
> /* lockdep: non-owner release */
> up_read_non_owner(&dio->inode->i_alloc_sem);
>
> @@ -515,21 +509,22 @@ static int get_more_blocks(struct dio *d
> map_bh->b_state = 0;
> map_bh->b_size = fs_count << dio->inode->i_blkbits;
>
> + /*
> + * For writes inside i_size on a DIO_LOCKING filesystem we
> + * forbid block creations: only overwrites are permitted.
> + * For this case we fall back to buffered writes at a higher
> + * level for inside-i_size block-instantiating writes.
maybe:
+ * when an unmapped buffer is returned.
A note that this determination is made by the caller when it gets back
an unmapped bh might help dimwits like me next time I read the code ;)
> + * For filesystems with their own locking the decision is
> + * left to the get_blocks method.
> + */
> create = dio->rw & WRITE;
> - if (dio->lock_type == DIO_LOCKING) {
> + if (dio->flags & DIO_LOCKING) {
> if (dio->block_in_file < (i_size_read(dio->inode) >>
> dio->blkbits))
> create = 0;
> - } else if (dio->lock_type == DIO_NO_LOCKING) {
> - create = 0;
> }
>
> - /*
> - * For writes inside i_size we forbid block creations: only
> - * overwrites are permitted. We fall back to buffered writes
> - * at a higher level for inside-i_size block-instantiating
> - * writes.
> - */
> ret = (*dio->get_block)(dio->inode, fs_startblk,
> map_bh, create);
> }
> @@ -1042,7 +1037,7 @@ direct_io_worker(int rw, struct kiocb *i
> * we can let i_mutex go now that its achieved its purpose
> * of protecting us from looking up uninitialized blocks.
> */
> - if ((rw == READ) && (dio->lock_type == DIO_LOCKING))
> + if (rw == READ && (dio->flags & DIO_LOCKING))
> mutex_unlock(&dio->inode->i_mutex);
>
> /*
> @@ -1086,30 +1081,29 @@ direct_io_worker(int rw, struct kiocb *i
>
> /*
> * This is a library function for use by filesystem drivers.
> - * The locking rules are governed by the dio_lock_type parameter.
> *
> - * DIO_NO_LOCKING (no locking, for raw block device access)
> - * For writes, i_mutex is not held on entry; it is never taken.
> - *
> - * DIO_LOCKING (simple locking for regular files)
> - * For writes we are called under i_mutex and return with i_mutex held, even
> - * though it is internally dropped.
> - * For reads, i_mutex is not held on entry, but it is taken and dropped before
> - * returning.
> - *
> - * DIO_OWN_LOCKING (filesystem provides synchronisation and handling of
> - * uninitialised data, allowing parallel direct readers and writers)
> - * For writes we are called without i_mutex, return without it, never touch it.
> - * For reads we are called under i_mutex and return with i_mutex held, even
> - * though it may be internally dropped.
> - *
> - * Additional i_alloc_sem locking requirements described inline below.
> + * The locking rules are governed by the flags parameter:
> + * - if the flags value contains DIO_LOCKING we use a fancy locking
> + * scheme for dumb filesystems.
> + * For writes this functions is called under i_mutex and return with
^^function ^^returns
> + * i_mutex held, even though it is internally dropped.
Is it? where? Oh I guess in direct_io_worker.
> + * For reads, i_mutex is not held on entry, but it is taken and dropped
> + * before returning.
It's actually dropped by direct_io_worker, FWIW.
On my initial read of this stuff it sounds like this locking is done in
the function following the comments which confused me.
> + * For reads and writes i_alloc_sem is taken in shared mode and released
> + * on I/O completion (which may happen asynchronously after returning to
> + * the caller).
> + *
> + * - if the flags value does NOT contain DIO_LOCKING we don't use any
> + * internal locking but rather rely on the filesystem to synchronize
> + * direct I/O reads/writes versus each other and truncate.
> + * For reads and writes both i_mutex and i_alloc_sem are not held on
> + * entry and are never taken.
> */
> ssize_t
> __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
> struct block_device *bdev, const struct iovec *iov, loff_t offset,
> unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
> - int dio_lock_type)
> + int flags)
> {
> int seg;
> size_t size;
> @@ -1120,8 +1114,6 @@ __blockdev_direct_IO(int rw, struct kioc
> ssize_t retval = -EINVAL;
> loff_t end = offset;
> struct dio *dio;
> - int release_i_mutex = 0;
> - int acquire_i_mutex = 0;
>
> if (rw & WRITE)
> rw = WRITE_ODIRECT;
> @@ -1156,43 +1148,30 @@ __blockdev_direct_IO(int rw, struct kioc
> if (!dio)
> goto out;
>
> - /*
> - * For block device access DIO_NO_LOCKING is used,
> - * neither readers nor writers do any locking at all
> - * For regular files using DIO_LOCKING,
> - * readers need to grab i_mutex and i_alloc_sem
> - * writers need to grab i_alloc_sem only (i_mutex is already held)
> - * For regular files using DIO_OWN_LOCKING,
> - * neither readers nor writers take any locks here
> - */
> - dio->lock_type = dio_lock_type;
> - if (dio_lock_type != DIO_NO_LOCKING) {
> + dio->flags = flags;
> + if (dio->flags & DIO_LOCKING) {
> /* watch out for a 0 len io from a tricksy fs */
> if (rw == READ && end > offset) {
> - struct address_space *mapping;
> + struct address_space *mapping =
> + iocb->ki_filp->f_mapping;
>
> - mapping = iocb->ki_filp->f_mapping;
> - if (dio_lock_type != DIO_OWN_LOCKING) {
> - mutex_lock(&inode->i_mutex);
> - release_i_mutex = 1;
> - }
> + /* will be released by direct_io_worker */
ahah there we go
> + mutex_lock(&inode->i_mutex);
>
> retval = filemap_write_and_wait_range(mapping, offset,
> end - 1);
> if (retval) {
> + mutex_unlock(&inode->i_mutex);
> kfree(dio);
> goto out;
> }
> -
> - if (dio_lock_type == DIO_OWN_LOCKING) {
> - mutex_unlock(&inode->i_mutex);
> - acquire_i_mutex = 1;
> - }
> }
>
> - if (dio_lock_type == DIO_LOCKING)
> - /* lockdep: not the owner will release it */
> - down_read_non_owner(&inode->i_alloc_sem);
> + /*
> + * Will be released at I/O completion, possible in a
^^possibly
> + * different thread.
> + */
> + down_read_non_owner(&inode->i_alloc_sem);
> }
>
> /*
> @@ -1210,24 +1189,19 @@ __blockdev_direct_IO(int rw, struct kioc
> /*
> * In case of error extending write may have instantiated a few
> * blocks outside i_size. Trim these off again for DIO_LOCKING.
> - * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this by
> - * it's own meaner.
> + *
> + * NOTE: filesystems with their own locking have to handle this
> + * on their own.
> */
> - if (unlikely(retval < 0 && (rw & WRITE))) {
> - loff_t isize = i_size_read(inode);
> -
> - if (end > isize && dio_lock_type == DIO_LOCKING)
> - vmtruncate(inode, isize);
> + if (dio->flags & DIO_LOCKING) {
> + if (unlikely((rw & WRITE) && retval < 0)) {
> + loff_t isize = i_size_read(inode);
> + if (end > isize )
> + vmtruncate(inode, isize);
> + }
> }
>
> - if (rw == READ && dio_lock_type == DIO_LOCKING)
> - release_i_mutex = 0;
> -
> out:
> - if (release_i_mutex)
> - mutex_unlock(&inode->i_mutex);
> - else if (acquire_i_mutex)
> - mutex_lock(&inode->i_mutex);
> return retval;
> }
> EXPORT_SYMBOL(__blockdev_direct_IO);
> Index: linux-2.6/include/linux/fs.h
> ===================================================================
> --- linux-2.6.orig/include/linux/fs.h 2009-08-03 21:34:54.254532317 +0200
> +++ linux-2.6/include/linux/fs.h 2009-08-03 21:35:15.455534350 +0200
> @@ -2247,8 +2247,6 @@ ssize_t __blockdev_direct_IO(int rw, str
>
> enum {
> DIO_LOCKING = 1, /* need locking between buffered and direct access */
> - DIO_NO_LOCKING, /* bdev; no locking at all between buffered/direct */
> - DIO_OWN_LOCKING, /* filesystem locks buffered and direct internally */
> };
If this is meant to be a flag it should probably be
#define DIO_LOCKING 0x1
not an enum - or if it's an enum put it in hex w/ a comment that the
values are for flags. enum to me sorta implies that it's sequential but
maybe it's just my hangup ;)
> static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb,
> @@ -2266,16 +2264,7 @@ static inline ssize_t blockdev_direct_IO
> dio_iodone_t end_io)
> {
> return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
> - nr_segs, get_block, end_io, DIO_NO_LOCKING);
> -}
> -
> -static inline ssize_t blockdev_direct_IO_own_locking(int rw, struct kiocb *iocb,
> - struct inode *inode, struct block_device *bdev, const struct iovec *iov,
> - loff_t offset, unsigned long nr_segs, get_block_t get_block,
> - dio_iodone_t end_io)
> -{
> - return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
> - nr_segs, get_block, end_io, DIO_OWN_LOCKING);
> + nr_segs, get_block, end_io, 0);
> }
> #endif
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] cleanup blockdev_direct_IO locking
2009-08-03 22:59 ` Eric Sandeen
@ 2009-08-06 21:20 ` Christoph Hellwig
2009-08-10 16:46 ` Mingming
0 siblings, 1 reply; 6+ messages in thread
From: Christoph Hellwig @ 2009-08-06 21:20 UTC (permalink / raw)
To: Eric Sandeen; +Cc: Christoph Hellwig, linux-fsdevel
New version with Eric's comments addresses:
--
Subject: cleanup blockdev_direct_IO locking
From: Christoph Hellwig <hch@lst.de>
Currently the locking in blockdev_direct_IO is a mess, we have three different
locking types and very confusing checks for some of them. The most
complicated one is DIO_OWN_LOCKING for reads, which happens to not actually be
used.
This patch gets rid of the DIO_OWN_LOCKING - as mentioned above the read case
is unused anyway, and the write side is almost identical to DIO_NO_LOCKING.
The difference is that DIO_NO_LOCKING always sets the create argument for
the get_blocks callback to zero, but we can easily move that to the actual
get_blocks callbacks. There are four users of the DIO_NO_LOCKING mode:
gfs already ignores the create argument and thus is fine with the new
version, ocfs2 only errors out if create were ever set, and we can remove
this dead code now, the block device code only ever uses create for an
error message if we are fully beyond the device which can never happen,
and last but not least XFS will need the new behavour for writes.
Now we can replace the lock_type variable with a flags one, where no flag
means the DIO_NO_LOCKING behaviour and DIO_LOCKING is kept as the first
flag. Separate out the check for not allowing to fill holes into a separate
flag, although for now both flags always get set at the same time.
Also revamp the documentation of the locking scheme to actually make sense.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Index: linux-2.6/fs/ocfs2/aops.c
===================================================================
--- linux-2.6.orig/fs/ocfs2/aops.c 2009-08-06 17:55:31.432617149 -0300
+++ linux-2.6/fs/ocfs2/aops.c 2009-08-06 17:57:34.113342440 -0300
@@ -545,6 +545,9 @@ bail:
*
* called like this: dio->get_blocks(dio->inode, fs_startblk,
* fs_count, map_bh, dio->rw == WRITE);
+ *
+ * Note that we never bother to allocate blocks here, and thus ignore the
+ * create argument.
*/
static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
@@ -561,14 +564,6 @@ static int ocfs2_direct_IO_get_blocks(st
inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
- /*
- * Any write past EOF is not allowed because we'd be extending.
- */
- if (create && (iblock + max_blocks) > inode_blocks) {
- ret = -EIO;
- goto bail;
- }
-
/* This figures out the size of the next contiguous block, and
* our logical offset */
ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
@@ -580,15 +575,6 @@ static int ocfs2_direct_IO_get_blocks(st
goto bail;
}
- if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno && create) {
- ocfs2_error(inode->i_sb,
- "Inode %llu has a hole at block %llu\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- (unsigned long long)iblock);
- ret = -EROFS;
- goto bail;
- }
-
/*
* get_more_blocks() expects us to describe a hole by clearing
* the mapped bit on bh_result().
@@ -597,20 +583,8 @@ static int ocfs2_direct_IO_get_blocks(st
*/
if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
map_bh(bh_result, inode->i_sb, p_blkno);
- else {
- /*
- * ocfs2_prepare_inode_for_write() should have caught
- * the case where we'd be filling a hole and triggered
- * a buffered write instead.
- */
- if (create) {
- ret = -EIO;
- mlog_errno(ret);
- goto bail;
- }
-
+ else
clear_buffer_mapped(bh_result);
- }
/* make sure we don't map more than max_blocks blocks here as
that's all the kernel will handle at this point. */
Index: linux-2.6/fs/xfs/linux-2.6/xfs_aops.c
===================================================================
--- linux-2.6.orig/fs/xfs/linux-2.6/xfs_aops.c 2009-08-06 17:55:31.479592619 -0300
+++ linux-2.6/fs/xfs/linux-2.6/xfs_aops.c 2009-08-06 17:57:34.115365114 -0300
@@ -1545,19 +1545,13 @@ xfs_vm_direct_IO(
bdev = xfs_find_bdev_for_inode(XFS_I(inode));
- if (rw == WRITE) {
- iocb->private = xfs_alloc_ioend(inode, IOMAP_UNWRITTEN);
- ret = blockdev_direct_IO_own_locking(rw, iocb, inode,
- bdev, iov, offset, nr_segs,
- xfs_get_blocks_direct,
- xfs_end_io_direct);
- } else {
- iocb->private = xfs_alloc_ioend(inode, IOMAP_READ);
- ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
- bdev, iov, offset, nr_segs,
- xfs_get_blocks_direct,
- xfs_end_io_direct);
- }
+ iocb->private = xfs_alloc_ioend(inode, rw == WRITE ?
+ IOMAP_UNWRITTEN : IOMAP_READ);
+
+ ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov,
+ offset, nr_segs,
+ xfs_get_blocks_direct,
+ xfs_end_io_direct);
if (unlikely(ret != -EIOCBQUEUED && iocb->private))
xfs_destroy_ioend(iocb->private);
Index: linux-2.6/fs/direct-io.c
===================================================================
--- linux-2.6.orig/fs/direct-io.c 2009-08-06 17:55:31.485592615 -0300
+++ linux-2.6/fs/direct-io.c 2009-08-06 18:07:49.330594770 -0300
@@ -53,13 +53,6 @@
*
* If blkfactor is zero then the user's request was aligned to the filesystem's
* blocksize.
- *
- * lock_type is DIO_LOCKING for regular files on direct-IO-naive filesystems.
- * This determines whether we need to do the fancy locking which prevents
- * direct-IO from being able to read uninitialised disk blocks. If its zero
- * (blockdev) this locking is not done, and if it is DIO_OWN_LOCKING i_mutex is
- * not held for the entire direct write (taken briefly, initially, during a
- * direct read though, but its never held for the duration of a direct-IO).
*/
struct dio {
@@ -68,7 +61,7 @@ struct dio {
struct inode *inode;
int rw;
loff_t i_size; /* i_size when submitted */
- int lock_type; /* doesn't change */
+ int flags; /* doesn't change */
unsigned blkbits; /* doesn't change */
unsigned blkfactor; /* When we're using an alignment which
is finer than the filesystem's soft
@@ -240,7 +233,8 @@ static int dio_complete(struct dio *dio,
if (dio->end_io && dio->result)
dio->end_io(dio->iocb, offset, transferred,
dio->map_bh.b_private);
- if (dio->lock_type == DIO_LOCKING)
+
+ if (dio->flags & DIO_LOCKING)
/* lockdep: non-owner release */
up_read_non_owner(&dio->inode->i_alloc_sem);
@@ -515,21 +509,24 @@ static int get_more_blocks(struct dio *d
map_bh->b_state = 0;
map_bh->b_size = fs_count << dio->inode->i_blkbits;
+ /*
+ * For writes inside i_size on a DIO_SKIP_HOLES filesystem we
+ * forbid block creations: only overwrites are permitted.
+ * We will return early to the caller once we see an
+ * unmapped buffer head returned, and the caller will fall
+ * back to buffered I/O.
+ *
+ * Otherwise the decision is left to the get_blocks method,
+ * which may decide to handle it or also return an unmapped
+ * buffer head.
+ */
create = dio->rw & WRITE;
- if (dio->lock_type == DIO_LOCKING) {
+ if (dio->flags & DIO_SKIP_HOLES) {
if (dio->block_in_file < (i_size_read(dio->inode) >>
dio->blkbits))
create = 0;
- } else if (dio->lock_type == DIO_NO_LOCKING) {
- create = 0;
}
- /*
- * For writes inside i_size we forbid block creations: only
- * overwrites are permitted. We fall back to buffered writes
- * at a higher level for inside-i_size block-instantiating
- * writes.
- */
ret = (*dio->get_block)(dio->inode, fs_startblk,
map_bh, create);
}
@@ -1042,7 +1039,7 @@ direct_io_worker(int rw, struct kiocb *i
* we can let i_mutex go now that its achieved its purpose
* of protecting us from looking up uninitialized blocks.
*/
- if ((rw == READ) && (dio->lock_type == DIO_LOCKING))
+ if (rw == READ && (dio->flags & DIO_LOCKING))
mutex_unlock(&dio->inode->i_mutex);
/*
@@ -1086,30 +1083,28 @@ direct_io_worker(int rw, struct kiocb *i
/*
* This is a library function for use by filesystem drivers.
- * The locking rules are governed by the dio_lock_type parameter.
*
- * DIO_NO_LOCKING (no locking, for raw block device access)
- * For writes, i_mutex is not held on entry; it is never taken.
- *
- * DIO_LOCKING (simple locking for regular files)
- * For writes we are called under i_mutex and return with i_mutex held, even
- * though it is internally dropped.
- * For reads, i_mutex is not held on entry, but it is taken and dropped before
- * returning.
- *
- * DIO_OWN_LOCKING (filesystem provides synchronisation and handling of
- * uninitialised data, allowing parallel direct readers and writers)
- * For writes we are called without i_mutex, return without it, never touch it.
- * For reads we are called under i_mutex and return with i_mutex held, even
- * though it may be internally dropped.
- *
- * Additional i_alloc_sem locking requirements described inline below.
+ * The locking rules are governed by the flags parameter:
+ * - if the flags value contains DIO_LOCKING we use a fancy locking
+ * scheme for dumb filesystems.
+ * For writes this function is called under i_mutex and returns with
+ * i_mutex held, for reads, i_mutex is not held on entry, but it is
+ * taken and dropped again before returning.
+ * For reads and writes i_alloc_sem is taken in shared mode and released
+ * on I/O completion (which may happen asynchronously after returning to
+ * the caller).
+ *
+ * - if the flags value does NOT contain DIO_LOCKING we don't use any
+ * internal locking but rather rely on the filesystem to synchronize
+ * direct I/O reads/writes versus each other and truncate.
+ * For reads and writes both i_mutex and i_alloc_sem are not held on
+ * entry and are never taken.
*/
ssize_t
__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
struct block_device *bdev, const struct iovec *iov, loff_t offset,
unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
- int dio_lock_type)
+ int flags)
{
int seg;
size_t size;
@@ -1120,8 +1115,6 @@ __blockdev_direct_IO(int rw, struct kioc
ssize_t retval = -EINVAL;
loff_t end = offset;
struct dio *dio;
- int release_i_mutex = 0;
- int acquire_i_mutex = 0;
if (rw & WRITE)
rw = WRITE_ODIRECT;
@@ -1156,43 +1149,30 @@ __blockdev_direct_IO(int rw, struct kioc
if (!dio)
goto out;
- /*
- * For block device access DIO_NO_LOCKING is used,
- * neither readers nor writers do any locking at all
- * For regular files using DIO_LOCKING,
- * readers need to grab i_mutex and i_alloc_sem
- * writers need to grab i_alloc_sem only (i_mutex is already held)
- * For regular files using DIO_OWN_LOCKING,
- * neither readers nor writers take any locks here
- */
- dio->lock_type = dio_lock_type;
- if (dio_lock_type != DIO_NO_LOCKING) {
+ dio->flags = flags;
+ if (dio->flags & DIO_LOCKING) {
/* watch out for a 0 len io from a tricksy fs */
if (rw == READ && end > offset) {
- struct address_space *mapping;
+ struct address_space *mapping =
+ iocb->ki_filp->f_mapping;
- mapping = iocb->ki_filp->f_mapping;
- if (dio_lock_type != DIO_OWN_LOCKING) {
- mutex_lock(&inode->i_mutex);
- release_i_mutex = 1;
- }
+ /* will be released by direct_io_worker */
+ mutex_lock(&inode->i_mutex);
retval = filemap_write_and_wait_range(mapping, offset,
end - 1);
if (retval) {
+ mutex_unlock(&inode->i_mutex);
kfree(dio);
goto out;
}
-
- if (dio_lock_type == DIO_OWN_LOCKING) {
- mutex_unlock(&inode->i_mutex);
- acquire_i_mutex = 1;
- }
}
- if (dio_lock_type == DIO_LOCKING)
- /* lockdep: not the owner will release it */
- down_read_non_owner(&inode->i_alloc_sem);
+ /*
+ * Will be released at I/O completion, possibly in a
+ * different thread.
+ */
+ down_read_non_owner(&inode->i_alloc_sem);
}
/*
@@ -1210,24 +1190,19 @@ __blockdev_direct_IO(int rw, struct kioc
/*
* In case of error extending write may have instantiated a few
* blocks outside i_size. Trim these off again for DIO_LOCKING.
- * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this by
- * it's own meaner.
+ *
+ * NOTE: filesystems with their own locking have to handle this
+ * on their own.
*/
- if (unlikely(retval < 0 && (rw & WRITE))) {
- loff_t isize = i_size_read(inode);
-
- if (end > isize && dio_lock_type == DIO_LOCKING)
- vmtruncate(inode, isize);
+ if (dio->flags & DIO_LOCKING) {
+ if (unlikely((rw & WRITE) && retval < 0)) {
+ loff_t isize = i_size_read(inode);
+ if (end > isize )
+ vmtruncate(inode, isize);
+ }
}
- if (rw == READ && dio_lock_type == DIO_LOCKING)
- release_i_mutex = 0;
-
out:
- if (release_i_mutex)
- mutex_unlock(&inode->i_mutex);
- else if (acquire_i_mutex)
- mutex_lock(&inode->i_mutex);
return retval;
}
EXPORT_SYMBOL(__blockdev_direct_IO);
Index: linux-2.6/include/linux/fs.h
===================================================================
--- linux-2.6.orig/include/linux/fs.h 2009-08-06 17:57:31.395366559 -0300
+++ linux-2.6/include/linux/fs.h 2009-08-06 18:09:28.621594817 -0300
@@ -2246,9 +2246,11 @@ ssize_t __blockdev_direct_IO(int rw, str
int lock_type);
enum {
- DIO_LOCKING = 1, /* need locking between buffered and direct access */
- DIO_NO_LOCKING, /* bdev; no locking at all between buffered/direct */
- DIO_OWN_LOCKING, /* filesystem locks buffered and direct internally */
+ /* need locking between buffered and direct access */
+ DIO_LOCKING = 0x01,
+
+ /* filesystem does not support filling holes */
+ DIO_SKIP_HOLES = 0x02,
};
static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb,
@@ -2257,7 +2259,8 @@ static inline ssize_t blockdev_direct_IO
dio_iodone_t end_io)
{
return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
- nr_segs, get_block, end_io, DIO_LOCKING);
+ nr_segs, get_block, end_io,
+ DIO_LOCKING | DIO_SKIP_HOLES);
}
static inline ssize_t blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb,
@@ -2266,16 +2269,7 @@ static inline ssize_t blockdev_direct_IO
dio_iodone_t end_io)
{
return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
- nr_segs, get_block, end_io, DIO_NO_LOCKING);
-}
-
-static inline ssize_t blockdev_direct_IO_own_locking(int rw, struct kiocb *iocb,
- struct inode *inode, struct block_device *bdev, const struct iovec *iov,
- loff_t offset, unsigned long nr_segs, get_block_t get_block,
- dio_iodone_t end_io)
-{
- return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
- nr_segs, get_block, end_io, DIO_OWN_LOCKING);
+ nr_segs, get_block, end_io, 0);
}
#endif
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] cleanup blockdev_direct_IO locking
2009-08-06 21:20 ` Christoph Hellwig
@ 2009-08-10 16:46 ` Mingming
0 siblings, 0 replies; 6+ messages in thread
From: Mingming @ 2009-08-10 16:46 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Eric Sandeen, linux-fsdevel
On Thu, 2009-08-06 at 23:20 +0200, Christoph Hellwig wrote:
> New version with Eric's comments addresses:
>
> --
>
> Subject: cleanup blockdev_direct_IO locking
> From: Christoph Hellwig <hch@lst.de>
>
> Currently the locking in blockdev_direct_IO is a mess, we have three different
> locking types and very confusing checks for some of them. The most
> complicated one is DIO_OWN_LOCKING for reads, which happens to not actually be
> used.
>
> This patch gets rid of the DIO_OWN_LOCKING - as mentioned above the read case
> is unused anyway, and the write side is almost identical to DIO_NO_LOCKING.
> The difference is that DIO_NO_LOCKING always sets the create argument for
> the get_blocks callback to zero, but we can easily move that to the actual
> get_blocks callbacks. There are four users of the DIO_NO_LOCKING mode:
> gfs already ignores the create argument and thus is fine with the new
> version, ocfs2 only errors out if create were ever set, and we can remove
> this dead code now, the block device code only ever uses create for an
> error message if we are fully beyond the device which can never happen,
> and last but not least XFS will need the new behavour for writes.
>
> Now we can replace the lock_type variable with a flags one, where no flag
> means the DIO_NO_LOCKING behaviour and DIO_LOCKING is kept as the first
> flag. Separate out the check for not allowing to fill holes into a separate
> flag, although for now both flags always get set at the same time.
>
> Also revamp the documentation of the locking scheme to actually make sense.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
>
> Index: linux-2.6/fs/ocfs2/aops.c
> ===================================================================
> --- linux-2.6.orig/fs/ocfs2/aops.c 2009-08-06 17:55:31.432617149 -0300
> +++ linux-2.6/fs/ocfs2/aops.c 2009-08-06 17:57:34.113342440 -0300
> @@ -545,6 +545,9 @@ bail:
> *
> * called like this: dio->get_blocks(dio->inode, fs_startblk,
> * fs_count, map_bh, dio->rw == WRITE);
> + *
> + * Note that we never bother to allocate blocks here, and thus ignore the
> + * create argument.
> */
> static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
> struct buffer_head *bh_result, int create)
> @@ -561,14 +564,6 @@ static int ocfs2_direct_IO_get_blocks(st
>
> inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
>
> - /*
> - * Any write past EOF is not allowed because we'd be extending.
> - */
> - if (create && (iblock + max_blocks) > inode_blocks) {
> - ret = -EIO;
> - goto bail;
> - }
> -
> /* This figures out the size of the next contiguous block, and
> * our logical offset */
> ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
> @@ -580,15 +575,6 @@ static int ocfs2_direct_IO_get_blocks(st
> goto bail;
> }
>
> - if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno && create) {
> - ocfs2_error(inode->i_sb,
> - "Inode %llu has a hole at block %llu\n",
> - (unsigned long long)OCFS2_I(inode)->ip_blkno,
> - (unsigned long long)iblock);
> - ret = -EROFS;
> - goto bail;
> - }
> -
> /*
> * get_more_blocks() expects us to describe a hole by clearing
> * the mapped bit on bh_result().
> @@ -597,20 +583,8 @@ static int ocfs2_direct_IO_get_blocks(st
> */
> if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
> map_bh(bh_result, inode->i_sb, p_blkno);
> - else {
> - /*
> - * ocfs2_prepare_inode_for_write() should have caught
> - * the case where we'd be filling a hole and triggered
> - * a buffered write instead.
> - */
> - if (create) {
> - ret = -EIO;
> - mlog_errno(ret);
> - goto bail;
> - }
> -
> + else
> clear_buffer_mapped(bh_result);
> - }
>
> /* make sure we don't map more than max_blocks blocks here as
> that's all the kernel will handle at this point. */
> Index: linux-2.6/fs/xfs/linux-2.6/xfs_aops.c
> ===================================================================
> --- linux-2.6.orig/fs/xfs/linux-2.6/xfs_aops.c 2009-08-06 17:55:31.479592619 -0300
> +++ linux-2.6/fs/xfs/linux-2.6/xfs_aops.c 2009-08-06 17:57:34.115365114 -0300
> @@ -1545,19 +1545,13 @@ xfs_vm_direct_IO(
>
> bdev = xfs_find_bdev_for_inode(XFS_I(inode));
>
> - if (rw == WRITE) {
> - iocb->private = xfs_alloc_ioend(inode, IOMAP_UNWRITTEN);
> - ret = blockdev_direct_IO_own_locking(rw, iocb, inode,
> - bdev, iov, offset, nr_segs,
> - xfs_get_blocks_direct,
> - xfs_end_io_direct);
> - } else {
> - iocb->private = xfs_alloc_ioend(inode, IOMAP_READ);
> - ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
> - bdev, iov, offset, nr_segs,
> - xfs_get_blocks_direct,
> - xfs_end_io_direct);
> - }
> + iocb->private = xfs_alloc_ioend(inode, rw == WRITE ?
> + IOMAP_UNWRITTEN : IOMAP_READ);
> +
> + ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov,
> + offset, nr_segs,
> + xfs_get_blocks_direct,
> + xfs_end_io_direct);
>
> if (unlikely(ret != -EIOCBQUEUED && iocb->private))
> xfs_destroy_ioend(iocb->private);
> Index: linux-2.6/fs/direct-io.c
> ===================================================================
> --- linux-2.6.orig/fs/direct-io.c 2009-08-06 17:55:31.485592615 -0300
> +++ linux-2.6/fs/direct-io.c 2009-08-06 18:07:49.330594770 -0300
> @@ -53,13 +53,6 @@
> *
> * If blkfactor is zero then the user's request was aligned to the filesystem's
> * blocksize.
> - *
> - * lock_type is DIO_LOCKING for regular files on direct-IO-naive filesystems.
> - * This determines whether we need to do the fancy locking which prevents
> - * direct-IO from being able to read uninitialised disk blocks. If its zero
> - * (blockdev) this locking is not done, and if it is DIO_OWN_LOCKING i_mutex is
> - * not held for the entire direct write (taken briefly, initially, during a
> - * direct read though, but its never held for the duration of a direct-IO).
> */
>
> struct dio {
> @@ -68,7 +61,7 @@ struct dio {
> struct inode *inode;
> int rw;
> loff_t i_size; /* i_size when submitted */
> - int lock_type; /* doesn't change */
> + int flags; /* doesn't change */
> unsigned blkbits; /* doesn't change */
> unsigned blkfactor; /* When we're using an alignment which
> is finer than the filesystem's soft
> @@ -240,7 +233,8 @@ static int dio_complete(struct dio *dio,
> if (dio->end_io && dio->result)
> dio->end_io(dio->iocb, offset, transferred,
> dio->map_bh.b_private);
> - if (dio->lock_type == DIO_LOCKING)
> +
> + if (dio->flags & DIO_LOCKING)
> /* lockdep: non-owner release */
> up_read_non_owner(&dio->inode->i_alloc_sem);
>
> @@ -515,21 +509,24 @@ static int get_more_blocks(struct dio *d
> map_bh->b_state = 0;
> map_bh->b_size = fs_count << dio->inode->i_blkbits;
>
> + /*
> + * For writes inside i_size on a DIO_SKIP_HOLES filesystem we
> + * forbid block creations: only overwrites are permitted.
> + * We will return early to the caller once we see an
> + * unmapped buffer head returned, and the caller will fall
> + * back to buffered I/O.
> + *
> + * Otherwise the decision is left to the get_blocks method,
> + * which may decide to handle it or also return an unmapped
> + * buffer head.
> + */
Thanks for the update.
I realized ext4 is not a pure DIO_FILL_HOLES filesystem, as it also
support old ext3 format file(non-extent based), so it would need the
create flag to set to 0, and the ext4 direct IO get_blocks will decide
whether to fill holes or skip holes, depending on the file is extent
based or not.
I feel it's better to remove the DIOS_SKIP_HOLES flag and comment that
the decision is left to the get_blocks method.
Mingming
> create = dio->rw & WRITE;
> - if (dio->lock_type == DIO_LOCKING) {
> + if (dio->flags & DIO_SKIP_HOLES) {
> if (dio->block_in_file < (i_size_read(dio->inode) >>
> dio->blkbits))
> create = 0;
> - } else if (dio->lock_type == DIO_NO_LOCKING) {
> - create = 0;
> }
>
> - /*
> - * For writes inside i_size we forbid block creations: only
> - * overwrites are permitted. We fall back to buffered writes
> - * at a higher level for inside-i_size block-instantiating
> - * writes.
> - */
> ret = (*dio->get_block)(dio->inode, fs_startblk,
> map_bh, create);
> }
> @@ -1042,7 +1039,7 @@ direct_io_worker(int rw, struct kiocb *i
> * we can let i_mutex go now that its achieved its purpose
> * of protecting us from looking up uninitialized blocks.
> */
> - if ((rw == READ) && (dio->lock_type == DIO_LOCKING))
> + if (rw == READ && (dio->flags & DIO_LOCKING))
> mutex_unlock(&dio->inode->i_mutex);
>
> /*
> @@ -1086,30 +1083,28 @@ direct_io_worker(int rw, struct kiocb *i
>
> /*
> * This is a library function for use by filesystem drivers.
> - * The locking rules are governed by the dio_lock_type parameter.
> *
> - * DIO_NO_LOCKING (no locking, for raw block device access)
> - * For writes, i_mutex is not held on entry; it is never taken.
> - *
> - * DIO_LOCKING (simple locking for regular files)
> - * For writes we are called under i_mutex and return with i_mutex held, even
> - * though it is internally dropped.
> - * For reads, i_mutex is not held on entry, but it is taken and dropped before
> - * returning.
> - *
> - * DIO_OWN_LOCKING (filesystem provides synchronisation and handling of
> - * uninitialised data, allowing parallel direct readers and writers)
> - * For writes we are called without i_mutex, return without it, never touch it.
> - * For reads we are called under i_mutex and return with i_mutex held, even
> - * though it may be internally dropped.
> - *
> - * Additional i_alloc_sem locking requirements described inline below.
> + * The locking rules are governed by the flags parameter:
> + * - if the flags value contains DIO_LOCKING we use a fancy locking
> + * scheme for dumb filesystems.
> + * For writes this function is called under i_mutex and returns with
> + * i_mutex held, for reads, i_mutex is not held on entry, but it is
> + * taken and dropped again before returning.
> + * For reads and writes i_alloc_sem is taken in shared mode and released
> + * on I/O completion (which may happen asynchronously after returning to
> + * the caller).
> + *
> + * - if the flags value does NOT contain DIO_LOCKING we don't use any
> + * internal locking but rather rely on the filesystem to synchronize
> + * direct I/O reads/writes versus each other and truncate.
> + * For reads and writes both i_mutex and i_alloc_sem are not held on
> + * entry and are never taken.
> */
> ssize_t
> __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
> struct block_device *bdev, const struct iovec *iov, loff_t offset,
> unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
> - int dio_lock_type)
> + int flags)
> {
> int seg;
> size_t size;
> @@ -1120,8 +1115,6 @@ __blockdev_direct_IO(int rw, struct kioc
> ssize_t retval = -EINVAL;
> loff_t end = offset;
> struct dio *dio;
> - int release_i_mutex = 0;
> - int acquire_i_mutex = 0;
>
> if (rw & WRITE)
> rw = WRITE_ODIRECT;
> @@ -1156,43 +1149,30 @@ __blockdev_direct_IO(int rw, struct kioc
> if (!dio)
> goto out;
>
> - /*
> - * For block device access DIO_NO_LOCKING is used,
> - * neither readers nor writers do any locking at all
> - * For regular files using DIO_LOCKING,
> - * readers need to grab i_mutex and i_alloc_sem
> - * writers need to grab i_alloc_sem only (i_mutex is already held)
> - * For regular files using DIO_OWN_LOCKING,
> - * neither readers nor writers take any locks here
> - */
> - dio->lock_type = dio_lock_type;
> - if (dio_lock_type != DIO_NO_LOCKING) {
> + dio->flags = flags;
> + if (dio->flags & DIO_LOCKING) {
> /* watch out for a 0 len io from a tricksy fs */
> if (rw == READ && end > offset) {
> - struct address_space *mapping;
> + struct address_space *mapping =
> + iocb->ki_filp->f_mapping;
>
> - mapping = iocb->ki_filp->f_mapping;
> - if (dio_lock_type != DIO_OWN_LOCKING) {
> - mutex_lock(&inode->i_mutex);
> - release_i_mutex = 1;
> - }
> + /* will be released by direct_io_worker */
> + mutex_lock(&inode->i_mutex);
>
> retval = filemap_write_and_wait_range(mapping, offset,
> end - 1);
> if (retval) {
> + mutex_unlock(&inode->i_mutex);
> kfree(dio);
> goto out;
> }
> -
> - if (dio_lock_type == DIO_OWN_LOCKING) {
> - mutex_unlock(&inode->i_mutex);
> - acquire_i_mutex = 1;
> - }
> }
>
> - if (dio_lock_type == DIO_LOCKING)
> - /* lockdep: not the owner will release it */
> - down_read_non_owner(&inode->i_alloc_sem);
> + /*
> + * Will be released at I/O completion, possibly in a
> + * different thread.
> + */
> + down_read_non_owner(&inode->i_alloc_sem);
> }
>
> /*
> @@ -1210,24 +1190,19 @@ __blockdev_direct_IO(int rw, struct kioc
> /*
> * In case of error extending write may have instantiated a few
> * blocks outside i_size. Trim these off again for DIO_LOCKING.
> - * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this by
> - * it's own meaner.
> + *
> + * NOTE: filesystems with their own locking have to handle this
> + * on their own.
> */
> - if (unlikely(retval < 0 && (rw & WRITE))) {
> - loff_t isize = i_size_read(inode);
> -
> - if (end > isize && dio_lock_type == DIO_LOCKING)
> - vmtruncate(inode, isize);
> + if (dio->flags & DIO_LOCKING) {
> + if (unlikely((rw & WRITE) && retval < 0)) {
> + loff_t isize = i_size_read(inode);
> + if (end > isize )
> + vmtruncate(inode, isize);
> + }
> }
>
> - if (rw == READ && dio_lock_type == DIO_LOCKING)
> - release_i_mutex = 0;
> -
> out:
> - if (release_i_mutex)
> - mutex_unlock(&inode->i_mutex);
> - else if (acquire_i_mutex)
> - mutex_lock(&inode->i_mutex);
> return retval;
> }
> EXPORT_SYMBOL(__blockdev_direct_IO);
> Index: linux-2.6/include/linux/fs.h
> ===================================================================
> --- linux-2.6.orig/include/linux/fs.h 2009-08-06 17:57:31.395366559 -0300
> +++ linux-2.6/include/linux/fs.h 2009-08-06 18:09:28.621594817 -0300
> @@ -2246,9 +2246,11 @@ ssize_t __blockdev_direct_IO(int rw, str
> int lock_type);
>
> enum {
> - DIO_LOCKING = 1, /* need locking between buffered and direct access */
> - DIO_NO_LOCKING, /* bdev; no locking at all between buffered/direct */
> - DIO_OWN_LOCKING, /* filesystem locks buffered and direct internally */
> + /* need locking between buffered and direct access */
> + DIO_LOCKING = 0x01,
> +
> + /* filesystem does not support filling holes */
> + DIO_SKIP_HOLES = 0x02,
> };
>
> static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb,
> @@ -2257,7 +2259,8 @@ static inline ssize_t blockdev_direct_IO
> dio_iodone_t end_io)
> {
> return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
> - nr_segs, get_block, end_io, DIO_LOCKING);
> + nr_segs, get_block, end_io,
> + DIO_LOCKING | DIO_SKIP_HOLES);
> }
>
> static inline ssize_t blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb,
> @@ -2266,16 +2269,7 @@ static inline ssize_t blockdev_direct_IO
> dio_iodone_t end_io)
> {
> return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
> - nr_segs, get_block, end_io, DIO_NO_LOCKING);
> -}
> -
> -static inline ssize_t blockdev_direct_IO_own_locking(int rw, struct kiocb *iocb,
> - struct inode *inode, struct block_device *bdev, const struct iovec *iov,
> - loff_t offset, unsigned long nr_segs, get_block_t get_block,
> - dio_iodone_t end_io)
> -{
> - return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
> - nr_segs, get_block, end_io, DIO_OWN_LOCKING);
> + nr_segs, get_block, end_io, 0);
> }
> #endif
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] cleanup blockdev_direct_IO locking
2009-08-03 22:19 [PATCH] cleanup blockdev_direct_IO locking Christoph Hellwig
2009-08-03 22:59 ` Eric Sandeen
@ 2009-09-26 19:53 ` Christoph Hellwig
1 sibling, 0 replies; 6+ messages in thread
From: Christoph Hellwig @ 2009-09-26 19:53 UTC (permalink / raw)
To: Al Viro; +Cc: linux-fsdevel
respin due to rejects in the current merge window:
--
Subject: cleanup blockdev_direct_IO locking
From: Christoph Hellwig <hch@lst.de>
Currently the locking in blockdev_direct_IO is a mess, we have three different
locking types and very confusing checks for some of them. The most
complicated one is DIO_OWN_LOCKING for reads, which happens to not actually be
used.
This patch gets rid of the DIO_OWN_LOCKING - as mentioned above the read case
is unused anyway, and the write side is almost identical to DIO_NO_LOCKING.
The difference is that DIO_NO_LOCKING always sets the create argument for
the get_blocks callback to zero, but we can easily move that to the actual
get_blocks callbacks. There are four users of the DIO_NO_LOCKING mode:
gfs already ignores the create argument and thus is fine with the new
version, ocfs2 only errors out if create were ever set, and we can remove
this dead code now, the block device code only ever uses create for an
error message if we are fully beyond the device which can never happen,
and last but not least XFS will need the new behavour for writes.
Now we can replace the lock_type variable with a flags one, where no flag
means the DIO_NO_LOCKING behaviour and DIO_LOCKING is kept as the first
flag. Separate out the check for not allowing to fill holes into a separate
flag, although for now both flags always get set at the same time.
Also revamp the documentation of the locking scheme to actually make sense.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Index: linux-2.6/fs/ocfs2/aops.c
===================================================================
--- linux-2.6.orig/fs/ocfs2/aops.c 2009-09-25 16:57:56.038003774 -0300
+++ linux-2.6/fs/ocfs2/aops.c 2009-09-26 13:49:13.063006329 -0300
@@ -547,6 +547,9 @@ bail:
*
* called like this: dio->get_blocks(dio->inode, fs_startblk,
* fs_count, map_bh, dio->rw == WRITE);
+ *
+ * Note that we never bother to allocate blocks here, and thus ignore the
+ * create argument.
*/
static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
@@ -563,14 +566,6 @@ static int ocfs2_direct_IO_get_blocks(st
inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
- /*
- * Any write past EOF is not allowed because we'd be extending.
- */
- if (create && (iblock + max_blocks) > inode_blocks) {
- ret = -EIO;
- goto bail;
- }
-
/* This figures out the size of the next contiguous block, and
* our logical offset */
ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
@@ -582,15 +577,6 @@ static int ocfs2_direct_IO_get_blocks(st
goto bail;
}
- if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno && create) {
- ocfs2_error(inode->i_sb,
- "Inode %llu has a hole at block %llu\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- (unsigned long long)iblock);
- ret = -EROFS;
- goto bail;
- }
-
/* We should already CoW the refcounted extent. */
BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED);
/*
@@ -601,20 +587,8 @@ static int ocfs2_direct_IO_get_blocks(st
*/
if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
map_bh(bh_result, inode->i_sb, p_blkno);
- else {
- /*
- * ocfs2_prepare_inode_for_write() should have caught
- * the case where we'd be filling a hole and triggered
- * a buffered write instead.
- */
- if (create) {
- ret = -EIO;
- mlog_errno(ret);
- goto bail;
- }
-
+ else
clear_buffer_mapped(bh_result);
- }
/* make sure we don't map more than max_blocks blocks here as
that's all the kernel will handle at this point. */
Index: linux-2.6/fs/xfs/linux-2.6/xfs_aops.c
===================================================================
--- linux-2.6.orig/fs/xfs/linux-2.6/xfs_aops.c 2009-09-25 16:57:56.041003946 -0300
+++ linux-2.6/fs/xfs/linux-2.6/xfs_aops.c 2009-09-26 13:48:46.207005192 -0300
@@ -1544,19 +1544,13 @@ xfs_vm_direct_IO(
bdev = xfs_find_bdev_for_inode(XFS_I(inode));
- if (rw == WRITE) {
- iocb->private = xfs_alloc_ioend(inode, IOMAP_UNWRITTEN);
- ret = blockdev_direct_IO_own_locking(rw, iocb, inode,
- bdev, iov, offset, nr_segs,
- xfs_get_blocks_direct,
- xfs_end_io_direct);
- } else {
- iocb->private = xfs_alloc_ioend(inode, IOMAP_READ);
- ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
- bdev, iov, offset, nr_segs,
- xfs_get_blocks_direct,
- xfs_end_io_direct);
- }
+ iocb->private = xfs_alloc_ioend(inode, rw == WRITE ?
+ IOMAP_UNWRITTEN : IOMAP_READ);
+
+ ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov,
+ offset, nr_segs,
+ xfs_get_blocks_direct,
+ xfs_end_io_direct);
if (unlikely(ret != -EIOCBQUEUED && iocb->private))
xfs_destroy_ioend(iocb->private);
Index: linux-2.6/fs/direct-io.c
===================================================================
--- linux-2.6.orig/fs/direct-io.c 2009-09-02 08:37:31.148714665 -0300
+++ linux-2.6/fs/direct-io.c 2009-09-26 13:48:46.210003968 -0300
@@ -53,13 +53,6 @@
*
* If blkfactor is zero then the user's request was aligned to the filesystem's
* blocksize.
- *
- * lock_type is DIO_LOCKING for regular files on direct-IO-naive filesystems.
- * This determines whether we need to do the fancy locking which prevents
- * direct-IO from being able to read uninitialised disk blocks. If its zero
- * (blockdev) this locking is not done, and if it is DIO_OWN_LOCKING i_mutex is
- * not held for the entire direct write (taken briefly, initially, during a
- * direct read though, but its never held for the duration of a direct-IO).
*/
struct dio {
@@ -68,7 +61,7 @@ struct dio {
struct inode *inode;
int rw;
loff_t i_size; /* i_size when submitted */
- int lock_type; /* doesn't change */
+ int flags; /* doesn't change */
unsigned blkbits; /* doesn't change */
unsigned blkfactor; /* When we're using an alignment which
is finer than the filesystem's soft
@@ -240,7 +233,8 @@ static int dio_complete(struct dio *dio,
if (dio->end_io && dio->result)
dio->end_io(dio->iocb, offset, transferred,
dio->map_bh.b_private);
- if (dio->lock_type == DIO_LOCKING)
+
+ if (dio->flags & DIO_LOCKING)
/* lockdep: non-owner release */
up_read_non_owner(&dio->inode->i_alloc_sem);
@@ -515,21 +509,24 @@ static int get_more_blocks(struct dio *d
map_bh->b_state = 0;
map_bh->b_size = fs_count << dio->inode->i_blkbits;
+ /*
+ * For writes inside i_size on a DIO_SKIP_HOLES filesystem we
+ * forbid block creations: only overwrites are permitted.
+ * We will return early to the caller once we see an
+ * unmapped buffer head returned, and the caller will fall
+ * back to buffered I/O.
+ *
+ * Otherwise the decision is left to the get_blocks method,
+ * which may decide to handle it or also return an unmapped
+ * buffer head.
+ */
create = dio->rw & WRITE;
- if (dio->lock_type == DIO_LOCKING) {
+ if (dio->flags & DIO_SKIP_HOLES) {
if (dio->block_in_file < (i_size_read(dio->inode) >>
dio->blkbits))
create = 0;
- } else if (dio->lock_type == DIO_NO_LOCKING) {
- create = 0;
}
- /*
- * For writes inside i_size we forbid block creations: only
- * overwrites are permitted. We fall back to buffered writes
- * at a higher level for inside-i_size block-instantiating
- * writes.
- */
ret = (*dio->get_block)(dio->inode, fs_startblk,
map_bh, create);
}
@@ -1042,7 +1039,7 @@ direct_io_worker(int rw, struct kiocb *i
* we can let i_mutex go now that its achieved its purpose
* of protecting us from looking up uninitialized blocks.
*/
- if ((rw == READ) && (dio->lock_type == DIO_LOCKING))
+ if (rw == READ && (dio->flags & DIO_LOCKING))
mutex_unlock(&dio->inode->i_mutex);
/*
@@ -1086,30 +1083,28 @@ direct_io_worker(int rw, struct kiocb *i
/*
* This is a library function for use by filesystem drivers.
- * The locking rules are governed by the dio_lock_type parameter.
*
- * DIO_NO_LOCKING (no locking, for raw block device access)
- * For writes, i_mutex is not held on entry; it is never taken.
- *
- * DIO_LOCKING (simple locking for regular files)
- * For writes we are called under i_mutex and return with i_mutex held, even
- * though it is internally dropped.
- * For reads, i_mutex is not held on entry, but it is taken and dropped before
- * returning.
- *
- * DIO_OWN_LOCKING (filesystem provides synchronisation and handling of
- * uninitialised data, allowing parallel direct readers and writers)
- * For writes we are called without i_mutex, return without it, never touch it.
- * For reads we are called under i_mutex and return with i_mutex held, even
- * though it may be internally dropped.
- *
- * Additional i_alloc_sem locking requirements described inline below.
+ * The locking rules are governed by the flags parameter:
+ * - if the flags value contains DIO_LOCKING we use a fancy locking
+ * scheme for dumb filesystems.
+ * For writes this function is called under i_mutex and returns with
+ * i_mutex held, for reads, i_mutex is not held on entry, but it is
+ * taken and dropped again before returning.
+ * For reads and writes i_alloc_sem is taken in shared mode and released
+ * on I/O completion (which may happen asynchronously after returning to
+ * the caller).
+ *
+ * - if the flags value does NOT contain DIO_LOCKING we don't use any
+ * internal locking but rather rely on the filesystem to synchronize
+ * direct I/O reads/writes versus each other and truncate.
+ * For reads and writes both i_mutex and i_alloc_sem are not held on
+ * entry and are never taken.
*/
ssize_t
__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
struct block_device *bdev, const struct iovec *iov, loff_t offset,
unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
- int dio_lock_type)
+ int flags)
{
int seg;
size_t size;
@@ -1120,8 +1115,6 @@ __blockdev_direct_IO(int rw, struct kioc
ssize_t retval = -EINVAL;
loff_t end = offset;
struct dio *dio;
- int release_i_mutex = 0;
- int acquire_i_mutex = 0;
if (rw & WRITE)
rw = WRITE_ODIRECT;
@@ -1156,43 +1149,30 @@ __blockdev_direct_IO(int rw, struct kioc
if (!dio)
goto out;
- /*
- * For block device access DIO_NO_LOCKING is used,
- * neither readers nor writers do any locking at all
- * For regular files using DIO_LOCKING,
- * readers need to grab i_mutex and i_alloc_sem
- * writers need to grab i_alloc_sem only (i_mutex is already held)
- * For regular files using DIO_OWN_LOCKING,
- * neither readers nor writers take any locks here
- */
- dio->lock_type = dio_lock_type;
- if (dio_lock_type != DIO_NO_LOCKING) {
+ dio->flags = flags;
+ if (dio->flags & DIO_LOCKING) {
/* watch out for a 0 len io from a tricksy fs */
if (rw == READ && end > offset) {
- struct address_space *mapping;
+ struct address_space *mapping =
+ iocb->ki_filp->f_mapping;
- mapping = iocb->ki_filp->f_mapping;
- if (dio_lock_type != DIO_OWN_LOCKING) {
- mutex_lock(&inode->i_mutex);
- release_i_mutex = 1;
- }
+ /* will be released by direct_io_worker */
+ mutex_lock(&inode->i_mutex);
retval = filemap_write_and_wait_range(mapping, offset,
end - 1);
if (retval) {
+ mutex_unlock(&inode->i_mutex);
kfree(dio);
goto out;
}
-
- if (dio_lock_type == DIO_OWN_LOCKING) {
- mutex_unlock(&inode->i_mutex);
- acquire_i_mutex = 1;
- }
}
- if (dio_lock_type == DIO_LOCKING)
- /* lockdep: not the owner will release it */
- down_read_non_owner(&inode->i_alloc_sem);
+ /*
+ * Will be released at I/O completion, possibly in a
+ * different thread.
+ */
+ down_read_non_owner(&inode->i_alloc_sem);
}
/*
@@ -1210,24 +1190,19 @@ __blockdev_direct_IO(int rw, struct kioc
/*
* In case of error extending write may have instantiated a few
* blocks outside i_size. Trim these off again for DIO_LOCKING.
- * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this by
- * it's own meaner.
+ *
+ * NOTE: filesystems with their own locking have to handle this
+ * on their own.
*/
- if (unlikely(retval < 0 && (rw & WRITE))) {
- loff_t isize = i_size_read(inode);
-
- if (end > isize && dio_lock_type == DIO_LOCKING)
- vmtruncate(inode, isize);
+ if (dio->flags & DIO_LOCKING) {
+ if (unlikely((rw & WRITE) && retval < 0)) {
+ loff_t isize = i_size_read(inode);
+ if (end > isize )
+ vmtruncate(inode, isize);
+ }
}
- if (rw == READ && dio_lock_type == DIO_LOCKING)
- release_i_mutex = 0;
-
out:
- if (release_i_mutex)
- mutex_unlock(&inode->i_mutex);
- else if (acquire_i_mutex)
- mutex_lock(&inode->i_mutex);
return retval;
}
EXPORT_SYMBOL(__blockdev_direct_IO);
Index: linux-2.6/include/linux/fs.h
===================================================================
--- linux-2.6.orig/include/linux/fs.h 2009-09-25 16:57:56.047003383 -0300
+++ linux-2.6/include/linux/fs.h 2009-09-26 13:48:46.213004630 -0300
@@ -2261,9 +2261,11 @@ ssize_t __blockdev_direct_IO(int rw, str
int lock_type);
enum {
- DIO_LOCKING = 1, /* need locking between buffered and direct access */
- DIO_NO_LOCKING, /* bdev; no locking at all between buffered/direct */
- DIO_OWN_LOCKING, /* filesystem locks buffered and direct internally */
+ /* need locking between buffered and direct access */
+ DIO_LOCKING = 0x01,
+
+ /* filesystem does not support filling holes */
+ DIO_SKIP_HOLES = 0x02,
};
static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb,
@@ -2272,7 +2274,8 @@ static inline ssize_t blockdev_direct_IO
dio_iodone_t end_io)
{
return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
- nr_segs, get_block, end_io, DIO_LOCKING);
+ nr_segs, get_block, end_io,
+ DIO_LOCKING | DIO_SKIP_HOLES);
}
static inline ssize_t blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb,
@@ -2281,16 +2284,7 @@ static inline ssize_t blockdev_direct_IO
dio_iodone_t end_io)
{
return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
- nr_segs, get_block, end_io, DIO_NO_LOCKING);
-}
-
-static inline ssize_t blockdev_direct_IO_own_locking(int rw, struct kiocb *iocb,
- struct inode *inode, struct block_device *bdev, const struct iovec *iov,
- loff_t offset, unsigned long nr_segs, get_block_t get_block,
- dio_iodone_t end_io)
-{
- return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
- nr_segs, get_block, end_io, DIO_OWN_LOCKING);
+ nr_segs, get_block, end_io, 0);
}
#endif
^ permalink raw reply [flat|nested] 6+ messages in thread
* [PATCH] cleanup blockdev_direct_IO locking
@ 2009-11-03 15:44 Christoph Hellwig
0 siblings, 0 replies; 6+ messages in thread
From: Christoph Hellwig @ 2009-11-03 15:44 UTC (permalink / raw)
To: viro; +Cc: akpm, linux-fsdevel
Currently the locking in blockdev_direct_IO is a mess, we have three different
locking types and very confusing checks for some of them. The most
complicated one is DIO_OWN_LOCKING for reads, which happens to not actually be
used.
This patch gets rid of the DIO_OWN_LOCKING - as mentioned above the read case
is unused anyway, and the write side is almost identical to DIO_NO_LOCKING.
The difference is that DIO_NO_LOCKING always sets the create argument for
the get_blocks callback to zero, but we can easily move that to the actual
get_blocks callbacks. There are four users of the DIO_NO_LOCKING mode:
gfs already ignores the create argument and thus is fine with the new
version, ocfs2 only errors out if create were ever set, and we can remove
this dead code now, the block device code only ever uses create for an
error message if we are fully beyond the device which can never happen,
and last but not least XFS will need the new behavour for writes.
Now we can replace the lock_type variable with a flags one, where no flag
means the DIO_NO_LOCKING behaviour and DIO_LOCKING is kept as the first
flag. Separate out the check for not allowing to fill holes into a separate
flag, although for now both flags always get set at the same time.
Also revamp the documentation of the locking scheme to actually make sense.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Index: linux-2.6/fs/ocfs2/aops.c
===================================================================
--- linux-2.6.orig/fs/ocfs2/aops.c 2009-09-30 21:55:31.069261479 +0200
+++ linux-2.6/fs/ocfs2/aops.c 2009-11-03 15:11:23.524004009 +0100
@@ -547,6 +547,9 @@ bail:
*
* called like this: dio->get_blocks(dio->inode, fs_startblk,
* fs_count, map_bh, dio->rw == WRITE);
+ *
+ * Note that we never bother to allocate blocks here, and thus ignore the
+ * create argument.
*/
static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
@@ -563,14 +566,6 @@ static int ocfs2_direct_IO_get_blocks(st
inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
- /*
- * Any write past EOF is not allowed because we'd be extending.
- */
- if (create && (iblock + max_blocks) > inode_blocks) {
- ret = -EIO;
- goto bail;
- }
-
/* This figures out the size of the next contiguous block, and
* our logical offset */
ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
@@ -582,15 +577,6 @@ static int ocfs2_direct_IO_get_blocks(st
goto bail;
}
- if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno && create) {
- ocfs2_error(inode->i_sb,
- "Inode %llu has a hole at block %llu\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- (unsigned long long)iblock);
- ret = -EROFS;
- goto bail;
- }
-
/* We should already CoW the refcounted extent. */
BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED);
/*
@@ -601,20 +587,8 @@ static int ocfs2_direct_IO_get_blocks(st
*/
if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
map_bh(bh_result, inode->i_sb, p_blkno);
- else {
- /*
- * ocfs2_prepare_inode_for_write() should have caught
- * the case where we'd be filling a hole and triggered
- * a buffered write instead.
- */
- if (create) {
- ret = -EIO;
- mlog_errno(ret);
- goto bail;
- }
-
+ else
clear_buffer_mapped(bh_result);
- }
/* make sure we don't map more than max_blocks blocks here as
that's all the kernel will handle at this point. */
Index: linux-2.6/fs/xfs/linux-2.6/xfs_aops.c
===================================================================
--- linux-2.6.orig/fs/xfs/linux-2.6/xfs_aops.c 2009-11-03 11:03:20.322276375 +0100
+++ linux-2.6/fs/xfs/linux-2.6/xfs_aops.c 2009-11-03 15:11:23.525004207 +0100
@@ -1562,19 +1562,13 @@ xfs_vm_direct_IO(
bdev = xfs_find_bdev_for_inode(XFS_I(inode));
- if (rw == WRITE) {
- iocb->private = xfs_alloc_ioend(inode, IOMAP_UNWRITTEN);
- ret = blockdev_direct_IO_own_locking(rw, iocb, inode,
- bdev, iov, offset, nr_segs,
- xfs_get_blocks_direct,
- xfs_end_io_direct);
- } else {
- iocb->private = xfs_alloc_ioend(inode, IOMAP_READ);
- ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
- bdev, iov, offset, nr_segs,
- xfs_get_blocks_direct,
- xfs_end_io_direct);
- }
+ iocb->private = xfs_alloc_ioend(inode, rw == WRITE ?
+ IOMAP_UNWRITTEN : IOMAP_READ);
+
+ ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov,
+ offset, nr_segs,
+ xfs_get_blocks_direct,
+ xfs_end_io_direct);
if (unlikely(ret != -EIOCBQUEUED && iocb->private))
xfs_destroy_ioend(iocb->private);
Index: linux-2.6/fs/direct-io.c
===================================================================
--- linux-2.6.orig/fs/direct-io.c 2009-09-30 21:55:31.086254216 +0200
+++ linux-2.6/fs/direct-io.c 2009-11-03 15:11:23.527033865 +0100
@@ -53,13 +53,6 @@
*
* If blkfactor is zero then the user's request was aligned to the filesystem's
* blocksize.
- *
- * lock_type is DIO_LOCKING for regular files on direct-IO-naive filesystems.
- * This determines whether we need to do the fancy locking which prevents
- * direct-IO from being able to read uninitialised disk blocks. If its zero
- * (blockdev) this locking is not done, and if it is DIO_OWN_LOCKING i_mutex is
- * not held for the entire direct write (taken briefly, initially, during a
- * direct read though, but its never held for the duration of a direct-IO).
*/
struct dio {
@@ -68,7 +61,7 @@ struct dio {
struct inode *inode;
int rw;
loff_t i_size; /* i_size when submitted */
- int lock_type; /* doesn't change */
+ int flags; /* doesn't change */
unsigned blkbits; /* doesn't change */
unsigned blkfactor; /* When we're using an alignment which
is finer than the filesystem's soft
@@ -240,7 +233,8 @@ static int dio_complete(struct dio *dio,
if (dio->end_io && dio->result)
dio->end_io(dio->iocb, offset, transferred,
dio->map_bh.b_private);
- if (dio->lock_type == DIO_LOCKING)
+
+ if (dio->flags & DIO_LOCKING)
/* lockdep: non-owner release */
up_read_non_owner(&dio->inode->i_alloc_sem);
@@ -515,21 +509,24 @@ static int get_more_blocks(struct dio *d
map_bh->b_state = 0;
map_bh->b_size = fs_count << dio->inode->i_blkbits;
+ /*
+ * For writes inside i_size on a DIO_SKIP_HOLES filesystem we
+ * forbid block creations: only overwrites are permitted.
+ * We will return early to the caller once we see an
+ * unmapped buffer head returned, and the caller will fall
+ * back to buffered I/O.
+ *
+ * Otherwise the decision is left to the get_blocks method,
+ * which may decide to handle it or also return an unmapped
+ * buffer head.
+ */
create = dio->rw & WRITE;
- if (dio->lock_type == DIO_LOCKING) {
+ if (dio->flags & DIO_SKIP_HOLES) {
if (dio->block_in_file < (i_size_read(dio->inode) >>
dio->blkbits))
create = 0;
- } else if (dio->lock_type == DIO_NO_LOCKING) {
- create = 0;
}
- /*
- * For writes inside i_size we forbid block creations: only
- * overwrites are permitted. We fall back to buffered writes
- * at a higher level for inside-i_size block-instantiating
- * writes.
- */
ret = (*dio->get_block)(dio->inode, fs_startblk,
map_bh, create);
}
@@ -1042,7 +1039,7 @@ direct_io_worker(int rw, struct kiocb *i
* we can let i_mutex go now that its achieved its purpose
* of protecting us from looking up uninitialized blocks.
*/
- if ((rw == READ) && (dio->lock_type == DIO_LOCKING))
+ if (rw == READ && (dio->flags & DIO_LOCKING))
mutex_unlock(&dio->inode->i_mutex);
/*
@@ -1086,30 +1083,28 @@ direct_io_worker(int rw, struct kiocb *i
/*
* This is a library function for use by filesystem drivers.
- * The locking rules are governed by the dio_lock_type parameter.
*
- * DIO_NO_LOCKING (no locking, for raw block device access)
- * For writes, i_mutex is not held on entry; it is never taken.
- *
- * DIO_LOCKING (simple locking for regular files)
- * For writes we are called under i_mutex and return with i_mutex held, even
- * though it is internally dropped.
- * For reads, i_mutex is not held on entry, but it is taken and dropped before
- * returning.
- *
- * DIO_OWN_LOCKING (filesystem provides synchronisation and handling of
- * uninitialised data, allowing parallel direct readers and writers)
- * For writes we are called without i_mutex, return without it, never touch it.
- * For reads we are called under i_mutex and return with i_mutex held, even
- * though it may be internally dropped.
- *
- * Additional i_alloc_sem locking requirements described inline below.
+ * The locking rules are governed by the flags parameter:
+ * - if the flags value contains DIO_LOCKING we use a fancy locking
+ * scheme for dumb filesystems.
+ * For writes this function is called under i_mutex and returns with
+ * i_mutex held, for reads, i_mutex is not held on entry, but it is
+ * taken and dropped again before returning.
+ * For reads and writes i_alloc_sem is taken in shared mode and released
+ * on I/O completion (which may happen asynchronously after returning to
+ * the caller).
+ *
+ * - if the flags value does NOT contain DIO_LOCKING we don't use any
+ * internal locking but rather rely on the filesystem to synchronize
+ * direct I/O reads/writes versus each other and truncate.
+ * For reads and writes both i_mutex and i_alloc_sem are not held on
+ * entry and are never taken.
*/
ssize_t
__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
struct block_device *bdev, const struct iovec *iov, loff_t offset,
unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
- int dio_lock_type)
+ int flags)
{
int seg;
size_t size;
@@ -1120,8 +1115,6 @@ __blockdev_direct_IO(int rw, struct kioc
ssize_t retval = -EINVAL;
loff_t end = offset;
struct dio *dio;
- int release_i_mutex = 0;
- int acquire_i_mutex = 0;
if (rw & WRITE)
rw = WRITE_ODIRECT;
@@ -1156,43 +1149,30 @@ __blockdev_direct_IO(int rw, struct kioc
if (!dio)
goto out;
- /*
- * For block device access DIO_NO_LOCKING is used,
- * neither readers nor writers do any locking at all
- * For regular files using DIO_LOCKING,
- * readers need to grab i_mutex and i_alloc_sem
- * writers need to grab i_alloc_sem only (i_mutex is already held)
- * For regular files using DIO_OWN_LOCKING,
- * neither readers nor writers take any locks here
- */
- dio->lock_type = dio_lock_type;
- if (dio_lock_type != DIO_NO_LOCKING) {
+ dio->flags = flags;
+ if (dio->flags & DIO_LOCKING) {
/* watch out for a 0 len io from a tricksy fs */
if (rw == READ && end > offset) {
- struct address_space *mapping;
+ struct address_space *mapping =
+ iocb->ki_filp->f_mapping;
- mapping = iocb->ki_filp->f_mapping;
- if (dio_lock_type != DIO_OWN_LOCKING) {
- mutex_lock(&inode->i_mutex);
- release_i_mutex = 1;
- }
+ /* will be released by direct_io_worker */
+ mutex_lock(&inode->i_mutex);
retval = filemap_write_and_wait_range(mapping, offset,
end - 1);
if (retval) {
+ mutex_unlock(&inode->i_mutex);
kfree(dio);
goto out;
}
-
- if (dio_lock_type == DIO_OWN_LOCKING) {
- mutex_unlock(&inode->i_mutex);
- acquire_i_mutex = 1;
- }
}
- if (dio_lock_type == DIO_LOCKING)
- /* lockdep: not the owner will release it */
- down_read_non_owner(&inode->i_alloc_sem);
+ /*
+ * Will be released at I/O completion, possibly in a
+ * different thread.
+ */
+ down_read_non_owner(&inode->i_alloc_sem);
}
/*
@@ -1210,24 +1190,19 @@ __blockdev_direct_IO(int rw, struct kioc
/*
* In case of error extending write may have instantiated a few
* blocks outside i_size. Trim these off again for DIO_LOCKING.
- * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this by
- * it's own meaner.
+ *
+ * NOTE: filesystems with their own locking have to handle this
+ * on their own.
*/
- if (unlikely(retval < 0 && (rw & WRITE))) {
- loff_t isize = i_size_read(inode);
-
- if (end > isize && dio_lock_type == DIO_LOCKING)
- vmtruncate(inode, isize);
+ if (dio->flags & DIO_LOCKING) {
+ if (unlikely((rw & WRITE) && retval < 0)) {
+ loff_t isize = i_size_read(inode);
+ if (end > isize )
+ vmtruncate(inode, isize);
+ }
}
- if (rw == READ && dio_lock_type == DIO_LOCKING)
- release_i_mutex = 0;
-
out:
- if (release_i_mutex)
- mutex_unlock(&inode->i_mutex);
- else if (acquire_i_mutex)
- mutex_lock(&inode->i_mutex);
return retval;
}
EXPORT_SYMBOL(__blockdev_direct_IO);
Index: linux-2.6/include/linux/fs.h
===================================================================
--- linux-2.6.orig/include/linux/fs.h 2009-10-05 23:29:19.232004491 +0200
+++ linux-2.6/include/linux/fs.h 2009-11-03 15:11:23.531004482 +0100
@@ -2265,9 +2265,11 @@ ssize_t __blockdev_direct_IO(int rw, str
int lock_type);
enum {
- DIO_LOCKING = 1, /* need locking between buffered and direct access */
- DIO_NO_LOCKING, /* bdev; no locking at all between buffered/direct */
- DIO_OWN_LOCKING, /* filesystem locks buffered and direct internally */
+ /* need locking between buffered and direct access */
+ DIO_LOCKING = 0x01,
+
+ /* filesystem does not support filling holes */
+ DIO_SKIP_HOLES = 0x02,
};
static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb,
@@ -2276,7 +2278,8 @@ static inline ssize_t blockdev_direct_IO
dio_iodone_t end_io)
{
return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
- nr_segs, get_block, end_io, DIO_LOCKING);
+ nr_segs, get_block, end_io,
+ DIO_LOCKING | DIO_SKIP_HOLES);
}
static inline ssize_t blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb,
@@ -2285,16 +2288,7 @@ static inline ssize_t blockdev_direct_IO
dio_iodone_t end_io)
{
return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
- nr_segs, get_block, end_io, DIO_NO_LOCKING);
-}
-
-static inline ssize_t blockdev_direct_IO_own_locking(int rw, struct kiocb *iocb,
- struct inode *inode, struct block_device *bdev, const struct iovec *iov,
- loff_t offset, unsigned long nr_segs, get_block_t get_block,
- dio_iodone_t end_io)
-{
- return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
- nr_segs, get_block, end_io, DIO_OWN_LOCKING);
+ nr_segs, get_block, end_io, 0);
}
#endif
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2009-11-03 15:45 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-08-03 22:19 [PATCH] cleanup blockdev_direct_IO locking Christoph Hellwig
2009-08-03 22:59 ` Eric Sandeen
2009-08-06 21:20 ` Christoph Hellwig
2009-08-10 16:46 ` Mingming
2009-09-26 19:53 ` Christoph Hellwig
-- strict thread matches above, loose matches on Subject: below --
2009-11-03 15:44 Christoph Hellwig
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).