* [PATCH 0/2 v2] Fix O_SYNC AIO DIO
@ 2013-08-14 9:10 Jan Kara
2013-08-14 9:10 ` [PATCH 1/2] direct-io: Implement generic deferred AIO completions Jan Kara
` (2 more replies)
0 siblings, 3 replies; 7+ messages in thread
From: Jan Kara @ 2013-08-14 9:10 UTC (permalink / raw)
To: linux-fsdevel; +Cc: linux-ext4, xfs, Christoph Hellwig, Al Viro, Jan Kara
Hello,
this is the second iteration of the patches to fix handling of O_SYNC AIO DIO.
Since the previous version I've addressed Dave's comments:
- slightly expanded changelog of the first patch
- workqueue is now created with parameters allowing parallelism (sketched below)
- workqueue name contains sb->s_id
- workqueue is created on demand (I decided to do this to reduce the overhead
in unnecessary cases)
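For reference, a minimal sketch of the two alloc_workqueue() calls the bullets
above refer to, taken from the v1 and v2 diffs later in this thread; the wrapper
functions are hypothetical, only the alloc_workqueue() lines appear in the
patches:

#include <linux/fs.h>
#include <linux/workqueue.h>

/* v1: allocated up front in alloc_super() for FS_REQUIRES_DEV filesystems;
 * max_active = 1, so deferred completions could not run in parallel. */
static struct workqueue_struct *dio_done_wq_v1(void)
{
	return alloc_workqueue("dio-sync", WQ_UNBOUND, 1);
}

/* v2: created on first use; max_active = 0 means the default limit, which
 * allows completion work items to run in parallel, and the workqueue name
 * now carries sb->s_id (e.g. "dio/sda1"). */
static struct workqueue_struct *dio_done_wq_v2(struct super_block *sb)
{
	return alloc_workqueue("dio/%s", WQ_MEM_RECLAIM, 0, sb->s_id);
}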
The patchset survives an xfstests run for ext4 & xfs, so it should be sane. Since
this touches several filesystems (although only ext4 & xfs are non-trivial),
the question is who should carry these patches. Maybe Al? But since the xfs and
ext4 changes are non-trivial, I'd like to have a review from their
developers...
Honza
* [PATCH 1/2] direct-io: Implement generic deferred AIO completions
2013-08-14 9:10 [PATCH 0/2 v2] Fix O_SYNC AIO DIO Jan Kara
@ 2013-08-14 9:10 ` Jan Kara
2013-08-14 9:10 ` [PATCH 2/2] direct-io: Handle O_(D)SYNC AIO Jan Kara
2013-08-30 15:53 ` [PATCH 0/2 v2] Fix O_SYNC AIO DIO Al Viro
2 siblings, 0 replies; 7+ messages in thread
From: Jan Kara @ 2013-08-14 9:10 UTC (permalink / raw)
To: linux-fsdevel
Cc: Jan Kara, xfs, Christoph Hellwig, Al Viro, linux-ext4,
Christoph Hellwig
From: Christoph Hellwig <hch@infradead.org>
Add support to the core direct-io code to defer AIO completions to user
context using a workqueue. This replaces open-coded and less efficient
code in XFS and ext4 (we save a memory allocation for each direct IO)
and will be needed to properly support O_(D)SYNC for AIO.
The communication between the filesystem and the direct I/O code requires
a new buffer head flag, which is a bit ugly but not avoidable until the
direct I/O code stops abusing the buffer_head structure for communicating
with the filesystems.
Currently this creates a per-superblock workqueue for these
completions, which is taken from an earlier patch by Jan Kara. I'm
not really convinced about this use and would prefer a "normal" global
workqueue with a high concurrency limit, but this needs further discussion.
JK: Fixed the ext4 part and switched to dynamic allocation of the workqueue.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jan Kara <jack@suse.cz>
---
fs/direct-io.c | 85 ++++++++++++++++++++++++++++++++++++---------
fs/ext4/ext4.h | 11 ------
fs/ext4/inode.c | 28 ++++-----------
fs/ext4/page-io.c | 30 ++++------------
fs/ext4/super.c | 16 ---------
fs/ocfs2/aops.c | 8 +----
fs/super.c | 18 +++++-----
fs/xfs/xfs_aops.c | 28 +++------------
fs/xfs/xfs_aops.h | 3 --
include/linux/buffer_head.h | 2 ++
include/linux/fs.h | 7 ++--
11 files changed, 105 insertions(+), 131 deletions(-)
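To make the new interface concrete, here is a hedged sketch (not part of the
patch) of how a filesystem's direct-IO get_block callback opts into deferred
completion once this series is applied. my_fs_get_block_dio() and
my_fs_map_blocks() are hypothetical names; set_buffer_defer_completion() and
the per-superblock workqueue are what the diff below introduces.

#include <linux/fs.h>
#include <linux/buffer_head.h>

/* Hypothetical mapping helper standing in for the filesystem's own block
 * mapping code; not a real API. */
int my_fs_map_blocks(struct inode *inode, sector_t iblock,
		     struct buffer_head *bh_result, int create);

static int my_fs_get_block_dio(struct inode *inode, sector_t iblock,
			       struct buffer_head *bh_result, int create)
{
	int ret = my_fs_map_blocks(inode, iblock, bh_result, create);

	/*
	 * If finishing this write needs process context (e.g. unwritten
	 * extent conversion), ask the direct-io core to run ->end_io from
	 * the per-superblock workqueue instead of from bio completion.
	 */
	if (ret == 0 && create && buffer_unwritten(bh_result))
		set_buffer_defer_completion(bh_result);
	return ret;
}

This mirrors what the _ext4_get_block() and __xfs_get_blocks() hunks below do
for their unwritten-extent cases.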
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 7ab90f5..8b31b9f 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -127,6 +127,7 @@ struct dio {
spinlock_t bio_lock; /* protects BIO fields below */
int page_errors; /* errno from get_user_pages() */
int is_async; /* is IO async ? */
+ bool defer_completion; /* defer AIO completion to workqueue? */
int io_error; /* IO error in completion path */
unsigned long refcount; /* direct_io_worker() and bios */
struct bio *bio_list; /* singly linked via bi_private */
@@ -141,7 +142,10 @@ struct dio {
* allocation time. Don't add new fields after pages[] unless you
* wish that they not be zeroed.
*/
- struct page *pages[DIO_PAGES]; /* page buffer */
+ union {
+ struct page *pages[DIO_PAGES]; /* page buffer */
+ struct work_struct complete_work;/* deferred AIO completion */
+ };
} ____cacheline_aligned_in_smp;
static struct kmem_cache *dio_cache __read_mostly;
@@ -221,16 +225,16 @@ static inline struct page *dio_get_page(struct dio *dio,
* dio_complete() - called when all DIO BIO I/O has been completed
* @offset: the byte offset in the file of the completed operation
*
- * This releases locks as dictated by the locking type, lets interested parties
- * know that a DIO operation has completed, and calculates the resulting return
- * code for the operation.
+ * This drops i_dio_count, lets interested parties know that a DIO operation
+ * has completed, and calculates the resulting return code for the operation.
*
* It lets the filesystem know if it registered an interest earlier via
* get_block. Pass the private field of the map buffer_head so that
* filesystems can use it to hold additional state between get_block calls and
* dio_complete.
*/
-static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is_async)
+static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret,
+ bool is_async)
{
ssize_t transferred = 0;
@@ -258,19 +262,26 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is
if (ret == 0)
ret = transferred;
- if (dio->end_io && dio->result) {
- dio->end_io(dio->iocb, offset, transferred,
- dio->private, ret, is_async);
- } else {
- inode_dio_done(dio->inode);
- if (is_async)
- aio_complete(dio->iocb, ret, 0);
- }
+ if (dio->end_io && dio->result)
+ dio->end_io(dio->iocb, offset, transferred, dio->private);
+
+ inode_dio_done(dio->inode);
+ if (is_async)
+ aio_complete(dio->iocb, ret, 0);
+ kmem_cache_free(dio_cache, dio);
return ret;
}
+static void dio_aio_complete_work(struct work_struct *work)
+{
+ struct dio *dio = container_of(work, struct dio, complete_work);
+
+ dio_complete(dio, dio->iocb->ki_pos, 0, true);
+}
+
static int dio_bio_complete(struct dio *dio, struct bio *bio);
+
/*
* Asynchronous IO callback.
*/
@@ -290,8 +301,13 @@ static void dio_bio_end_aio(struct bio *bio, int error)
spin_unlock_irqrestore(&dio->bio_lock, flags);
if (remaining == 0) {
- dio_complete(dio, dio->iocb->ki_pos, 0, true);
- kmem_cache_free(dio_cache, dio);
+ if (dio->result && dio->defer_completion) {
+ INIT_WORK(&dio->complete_work, dio_aio_complete_work);
+ queue_work(dio->inode->i_sb->s_dio_done_wq,
+ &dio->complete_work);
+ } else {
+ dio_complete(dio, dio->iocb->ki_pos, 0, true);
+ }
}
}
@@ -511,6 +527,41 @@ static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio)
}
/*
+ * Create workqueue for deferred direct IO completions. We allocate the
+ * workqueue when it's first needed. This avoids creating workqueue for
+ * filesystems that don't need it and also allows us to create the workqueue
+ * late enough so that we can include s_id in the name of the workqueue.
+ */
+static int sb_init_dio_done_wq(struct super_block *sb)
+{
+ struct workqueue_struct *wq = alloc_workqueue("dio/%s",
+ WQ_MEM_RECLAIM, 0,
+ sb->s_id);
+ if (!wq)
+ return -ENOMEM;
+ /*
+ * This has to be atomic as more DIOs can race to create the workqueue
+ */
+ cmpxchg(&sb->s_dio_done_wq, NULL, wq);
+ /* Someone created workqueue before us? Free ours... */
+ if (wq != sb->s_dio_done_wq)
+ destroy_workqueue(wq);
+ return 0;
+}
+
+static int dio_set_defer_completion(struct dio *dio)
+{
+ struct super_block *sb = dio->inode->i_sb;
+
+ if (dio->defer_completion)
+ return 0;
+ dio->defer_completion = true;
+ if (!sb->s_dio_done_wq)
+ return sb_init_dio_done_wq(sb);
+ return 0;
+}
+
+/*
* Call into the fs to map some more disk blocks. We record the current number
* of available blocks at sdio->blocks_available. These are in units of the
* fs blocksize, (1 << inode->i_blkbits).
@@ -581,6 +632,9 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
/* Store for completion */
dio->private = map_bh->b_private;
+
+ if (ret == 0 && buffer_defer_completion(map_bh))
+ ret = dio_set_defer_completion(dio);
}
return ret;
}
@@ -1269,7 +1323,6 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
if (drop_refcount(dio) == 0) {
retval = dio_complete(dio, offset, retval, false);
- kmem_cache_free(dio_cache, dio);
} else
BUG_ON(retval != -EIOCBQUEUED);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index b577e45..bb64697 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -180,7 +180,6 @@ struct ext4_map_blocks {
* Flags for ext4_io_end->flags
*/
#define EXT4_IO_END_UNWRITTEN 0x0001
-#define EXT4_IO_END_DIRECT 0x0002
/*
* For converting uninitialized extents on a work queue. 'handle' is used for
@@ -196,8 +195,6 @@ typedef struct ext4_io_end {
unsigned int flag; /* unwritten or not */
loff_t offset; /* offset in the file */
ssize_t size; /* size of the extent */
- struct kiocb *iocb; /* iocb struct for AIO */
- int result; /* error value for AIO */
atomic_t count; /* reference counter */
} ext4_io_end_t;
@@ -900,11 +897,9 @@ struct ext4_inode_info {
* Completed IOs that need unwritten extents handling and don't have
* transaction reserved
*/
- struct list_head i_unrsv_conversion_list;
atomic_t i_ioend_count; /* Number of outstanding io_end structs */
atomic_t i_unwritten; /* Nr. of inflight conversions pending */
struct work_struct i_rsv_conversion_work;
- struct work_struct i_unrsv_conversion_work;
spinlock_t i_block_reservation_lock;
@@ -1276,8 +1271,6 @@ struct ext4_sb_info {
struct flex_groups *s_flex_groups;
ext4_group_t s_flex_groups_allocated;
- /* workqueue for unreserved extent convertions (dio) */
- struct workqueue_struct *unrsv_conversion_wq;
/* workqueue for reserved extent conversions (buffered io) */
struct workqueue_struct *rsv_conversion_wq;
@@ -1340,9 +1333,6 @@ static inline void ext4_set_io_unwritten_flag(struct inode *inode,
struct ext4_io_end *io_end)
{
if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
- /* Writeback has to have coversion transaction reserved */
- WARN_ON(EXT4_SB(inode->i_sb)->s_journal && !io_end->handle &&
- !(io_end->flag & EXT4_IO_END_DIRECT));
io_end->flag |= EXT4_IO_END_UNWRITTEN;
atomic_inc(&EXT4_I(inode)->i_unwritten);
}
@@ -2715,7 +2705,6 @@ extern void ext4_put_io_end_defer(ext4_io_end_t *io_end);
extern void ext4_io_submit_init(struct ext4_io_submit *io,
struct writeback_control *wbc);
extern void ext4_end_io_rsv_work(struct work_struct *work);
-extern void ext4_end_io_unrsv_work(struct work_struct *work);
extern void ext4_io_submit(struct ext4_io_submit *io);
extern int ext4_bio_write_page(struct ext4_io_submit *io,
struct page *page,
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index dd32a2e..3780ce6 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -727,8 +727,12 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
ret = ext4_map_blocks(handle, inode, &map, flags);
if (ret > 0) {
+ ext4_io_end_t *io_end = ext4_inode_aio(inode);
+
map_bh(bh, inode->i_sb, map.m_pblk);
bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
+ if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
+ set_buffer_defer_completion(bh);
bh->b_size = inode->i_sb->s_blocksize * map.m_len;
ret = 0;
}
@@ -2991,19 +2995,13 @@ static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
}
static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
- ssize_t size, void *private, int ret,
- bool is_async)
+ ssize_t size, void *private)
{
- struct inode *inode = file_inode(iocb->ki_filp);
ext4_io_end_t *io_end = iocb->private;
/* if not async direct IO just return */
- if (!io_end) {
- inode_dio_done(inode);
- if (is_async)
- aio_complete(iocb, ret, 0);
+ if (!io_end)
return;
- }
ext_debug("ext4_end_io_dio(): io_end 0x%p "
"for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
@@ -3013,11 +3011,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
iocb->private = NULL;
io_end->offset = offset;
io_end->size = size;
- if (is_async) {
- io_end->iocb = iocb;
- io_end->result = ret;
- }
- ext4_put_io_end_defer(io_end);
+ ext4_put_io_end(io_end);
}
/*
@@ -3102,7 +3096,6 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
ret = -ENOMEM;
goto retake_lock;
}
- io_end->flag |= EXT4_IO_END_DIRECT;
/*
* Grab reference for DIO. Will be dropped in ext4_end_io_dio()
*/
@@ -3147,13 +3140,6 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) {
WARN_ON(iocb->private != io_end);
WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
- WARN_ON(io_end->iocb);
- /*
- * Generic code already did inode_dio_done() so we
- * have to clear EXT4_IO_END_DIRECT to not do it for
- * the second time.
- */
- io_end->flag = 0;
ext4_put_io_end(io_end);
iocb->private = NULL;
}
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 6625d21..d7d0c7b 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -123,10 +123,6 @@ static void ext4_release_io_end(ext4_io_end_t *io_end)
ext4_finish_bio(bio);
bio_put(bio);
}
- if (io_end->flag & EXT4_IO_END_DIRECT)
- inode_dio_done(io_end->inode);
- if (io_end->iocb)
- aio_complete(io_end->iocb, io_end->result, 0);
kmem_cache_free(io_end_cachep, io_end);
}
@@ -204,19 +200,14 @@ static void ext4_add_complete_io(ext4_io_end_t *io_end)
struct workqueue_struct *wq;
unsigned long flags;
- BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
+ /* Only reserved conversions from writeback should enter here */
+ WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
+ WARN_ON(!io_end->handle);
spin_lock_irqsave(&ei->i_completed_io_lock, flags);
- if (io_end->handle) {
- wq = EXT4_SB(io_end->inode->i_sb)->rsv_conversion_wq;
- if (list_empty(&ei->i_rsv_conversion_list))
- queue_work(wq, &ei->i_rsv_conversion_work);
- list_add_tail(&io_end->list, &ei->i_rsv_conversion_list);
- } else {
- wq = EXT4_SB(io_end->inode->i_sb)->unrsv_conversion_wq;
- if (list_empty(&ei->i_unrsv_conversion_list))
- queue_work(wq, &ei->i_unrsv_conversion_work);
- list_add_tail(&io_end->list, &ei->i_unrsv_conversion_list);
- }
+ wq = EXT4_SB(io_end->inode->i_sb)->rsv_conversion_wq;
+ if (list_empty(&ei->i_rsv_conversion_list))
+ queue_work(wq, &ei->i_rsv_conversion_work);
+ list_add_tail(&io_end->list, &ei->i_rsv_conversion_list);
spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
}
@@ -256,13 +247,6 @@ void ext4_end_io_rsv_work(struct work_struct *work)
ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_rsv_conversion_list);
}
-void ext4_end_io_unrsv_work(struct work_struct *work)
-{
- struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info,
- i_unrsv_conversion_work);
- ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_unrsv_conversion_list);
-}
-
ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
{
ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 36b141e..85c034f 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -762,9 +762,7 @@ static void ext4_put_super(struct super_block *sb)
ext4_unregister_li_request(sb);
dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
- flush_workqueue(sbi->unrsv_conversion_wq);
flush_workqueue(sbi->rsv_conversion_wq);
- destroy_workqueue(sbi->unrsv_conversion_wq);
destroy_workqueue(sbi->rsv_conversion_wq);
if (sbi->s_journal) {
@@ -875,14 +873,12 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
#endif
ei->jinode = NULL;
INIT_LIST_HEAD(&ei->i_rsv_conversion_list);
- INIT_LIST_HEAD(&ei->i_unrsv_conversion_list);
spin_lock_init(&ei->i_completed_io_lock);
ei->i_sync_tid = 0;
ei->i_datasync_tid = 0;
atomic_set(&ei->i_ioend_count, 0);
atomic_set(&ei->i_unwritten, 0);
INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
- INIT_WORK(&ei->i_unrsv_conversion_work, ext4_end_io_unrsv_work);
return &ei->vfs_inode;
}
@@ -3954,14 +3950,6 @@ no_journal:
goto failed_mount4;
}
- EXT4_SB(sb)->unrsv_conversion_wq =
- alloc_workqueue("ext4-unrsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
- if (!EXT4_SB(sb)->unrsv_conversion_wq) {
- printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");
- ret = -ENOMEM;
- goto failed_mount4;
- }
-
/*
* The jbd2_journal_load will have done any necessary log recovery,
* so we can safely mount the rest of the filesystem now.
@@ -4115,8 +4103,6 @@ failed_mount4:
ext4_msg(sb, KERN_ERR, "mount failed");
if (EXT4_SB(sb)->rsv_conversion_wq)
destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
- if (EXT4_SB(sb)->unrsv_conversion_wq)
- destroy_workqueue(EXT4_SB(sb)->unrsv_conversion_wq);
failed_mount_wq:
if (sbi->s_journal) {
jbd2_journal_destroy(sbi->s_journal);
@@ -4564,7 +4550,6 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
trace_ext4_sync_fs(sb, wait);
flush_workqueue(sbi->rsv_conversion_wq);
- flush_workqueue(sbi->unrsv_conversion_wq);
/*
* Writeback quota in non-journalled quota case - journalled quota has
* no dirty dquots
@@ -4600,7 +4585,6 @@ static int ext4_sync_fs_nojournal(struct super_block *sb, int wait)
trace_ext4_sync_fs(sb, wait);
flush_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
- flush_workqueue(EXT4_SB(sb)->unrsv_conversion_wq);
dquot_writeback_dquots(sb, -1);
if (wait && test_opt(sb, BARRIER))
ret = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 79736a2..a533593 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -565,9 +565,7 @@ bail:
static void ocfs2_dio_end_io(struct kiocb *iocb,
loff_t offset,
ssize_t bytes,
- void *private,
- int ret,
- bool is_async)
+ void *private)
{
struct inode *inode = file_inode(iocb->ki_filp);
int level;
@@ -592,10 +590,6 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
level = ocfs2_iocb_rw_locked_level(iocb);
ocfs2_rw_unlock(inode, level);
-
- inode_dio_done(inode);
- if (is_async)
- aio_complete(iocb, ret, 0);
}
/*
diff --git a/fs/super.c b/fs/super.c
index 68307c0..d961ae1 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -152,15 +152,9 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
static const struct super_operations default_op;
if (s) {
- if (security_sb_alloc(s)) {
- /*
- * We cannot call security_sb_free() without
- * security_sb_alloc() succeeding. So bail out manually
- */
- kfree(s);
- s = NULL;
- goto out;
- }
+ if (security_sb_alloc(s))
+ goto out_free_sb;
+
#ifdef CONFIG_SMP
s->s_files = alloc_percpu(struct list_head);
if (!s->s_files)
@@ -228,6 +222,7 @@ err_out:
free_percpu(s->s_files);
#endif
destroy_sb_writers(s);
+out_free_sb:
kfree(s);
s = NULL;
goto out;
@@ -412,6 +407,11 @@ void generic_shutdown_super(struct super_block *sb)
fsnotify_unmount_inodes(&sb->s_inodes);
+ if (sb->s_dio_done_wq) {
+ destroy_workqueue(sb->s_dio_done_wq);
+ sb->s_dio_done_wq = NULL;
+ }
+
evict_inodes(sb);
if (sop->put_super)
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 596ec71..e11d654 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -86,14 +86,6 @@ xfs_destroy_ioend(
bh->b_end_io(bh, !ioend->io_error);
}
- if (ioend->io_iocb) {
- inode_dio_done(ioend->io_inode);
- if (ioend->io_isasync) {
- aio_complete(ioend->io_iocb, ioend->io_error ?
- ioend->io_error : ioend->io_result, 0);
- }
- }
-
mempool_free(ioend, xfs_ioend_pool);
}
@@ -281,7 +273,6 @@ xfs_alloc_ioend(
* all the I/O from calling the completion routine too early.
*/
atomic_set(&ioend->io_remaining, 1);
- ioend->io_isasync = 0;
ioend->io_isdirect = 0;
ioend->io_error = 0;
ioend->io_list = NULL;
@@ -291,8 +282,6 @@ xfs_alloc_ioend(
ioend->io_buffer_tail = NULL;
ioend->io_offset = 0;
ioend->io_size = 0;
- ioend->io_iocb = NULL;
- ioend->io_result = 0;
ioend->io_append_trans = NULL;
INIT_WORK(&ioend->io_work, xfs_end_io);
@@ -1292,8 +1281,10 @@ __xfs_get_blocks(
if (create || !ISUNWRITTEN(&imap))
xfs_map_buffer(inode, bh_result, &imap, offset);
if (create && ISUNWRITTEN(&imap)) {
- if (direct)
+ if (direct) {
bh_result->b_private = inode;
+ set_buffer_defer_completion(bh_result);
+ }
set_buffer_unwritten(bh_result);
}
}
@@ -1390,9 +1381,7 @@ xfs_end_io_direct_write(
struct kiocb *iocb,
loff_t offset,
ssize_t size,
- void *private,
- int ret,
- bool is_async)
+ void *private)
{
struct xfs_ioend *ioend = iocb->private;
@@ -1414,17 +1403,10 @@ xfs_end_io_direct_write(
ioend->io_offset = offset;
ioend->io_size = size;
- ioend->io_iocb = iocb;
- ioend->io_result = ret;
if (private && size > 0)
ioend->io_type = XFS_IO_UNWRITTEN;
- if (is_async) {
- ioend->io_isasync = 1;
- xfs_finish_ioend(ioend);
- } else {
- xfs_finish_ioend_sync(ioend);
- }
+ xfs_finish_ioend_sync(ioend);
}
STATIC ssize_t
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index c325abb..f94dd45 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -45,7 +45,6 @@ typedef struct xfs_ioend {
unsigned int io_type; /* delalloc / unwritten */
int io_error; /* I/O error code */
atomic_t io_remaining; /* hold count */
- unsigned int io_isasync : 1; /* needs aio_complete */
unsigned int io_isdirect : 1;/* direct I/O */
struct inode *io_inode; /* file being written to */
struct buffer_head *io_buffer_head;/* buffer linked list head */
@@ -54,8 +53,6 @@ typedef struct xfs_ioend {
xfs_off_t io_offset; /* offset in the file */
struct work_struct io_work; /* xfsdatad work queue */
struct xfs_trans *io_append_trans;/* xact. for size update */
- struct kiocb *io_iocb;
- int io_result;
} xfs_ioend_t;
extern const struct address_space_operations xfs_address_space_operations;
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 91fa9a9..d77797a 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -36,6 +36,7 @@ enum bh_state_bits {
BH_Quiet, /* Buffer Error Prinks to be quiet */
BH_Meta, /* Buffer contains metadata */
BH_Prio, /* Buffer should be submitted with REQ_PRIO */
+ BH_Defer_Completion, /* Defer AIO completion to workqueue */
BH_PrivateStart,/* not a state bit, but the first bit available
* for private allocation by other entities
@@ -128,6 +129,7 @@ BUFFER_FNS(Write_EIO, write_io_error)
BUFFER_FNS(Unwritten, unwritten)
BUFFER_FNS(Meta, meta)
BUFFER_FNS(Prio, prio)
+BUFFER_FNS(Defer_Completion, defer_completion)
#define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9818747..913aeee 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -46,6 +46,7 @@ struct vfsmount;
struct cred;
struct swap_info_struct;
struct seq_file;
+struct workqueue_struct;
extern void __init inode_init(void);
extern void __init inode_init_early(void);
@@ -63,8 +64,7 @@ struct buffer_head;
typedef int (get_block_t)(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
- ssize_t bytes, void *private, int ret,
- bool is_async);
+ ssize_t bytes, void *private);
#define MAY_EXEC 0x00000001
#define MAY_WRITE 0x00000002
@@ -1328,6 +1328,9 @@ struct super_block {
/* Being remounted read-only */
int s_readonly_remount;
+
+ /* AIO completions deferred from interrupt context */
+ struct workqueue_struct *s_dio_done_wq;
};
/* superblock cache pruning functions */
--
1.8.1.4
* [PATCH 2/2] direct-io: Handle O_(D)SYNC AIO
2013-08-14 9:10 [PATCH 0/2 v2] Fix O_SYNC AIO DIO Jan Kara
2013-08-14 9:10 ` [PATCH 1/2] direct-io: Implement generic deferred AIO completions Jan Kara
@ 2013-08-14 9:10 ` Jan Kara
2013-08-30 15:53 ` [PATCH 0/2 v2] Fix O_SYNC AIO DIO Al Viro
2 siblings, 0 replies; 7+ messages in thread
From: Jan Kara @ 2013-08-14 9:10 UTC (permalink / raw)
To: linux-fsdevel
Cc: linux-ext4, xfs, Christoph Hellwig, Al Viro, Christoph Hellwig,
Jan Kara
From: Christoph Hellwig <hch@infradead.org>
Call generic_write_sync() from the deferred I/O completion handler if
O_DSYNC is set for a write request. Also make sure various callers
don't call generic_write_sync if the direct I/O code returns
-EIOCBQUEUED.
Based on an earlier patch from Jan Kara <jack@suse.cz> with updates from
Jeff Moyer <jmoyer@redhat.com> and Darrick J. Wong <darrick.wong@oracle.com>.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jan Kara <jack@suse.cz>
---
fs/block_dev.c | 2 +-
fs/btrfs/file.c | 2 +-
fs/cifs/file.c | 2 +-
fs/direct-io.c | 45 ++++++++++++++++++++++++++++++++++++---------
fs/ext4/file.c | 2 +-
mm/filemap.c | 2 +-
6 files changed, 41 insertions(+), 14 deletions(-)
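As a hedged illustration of the caller-side change (my_fs_file_aio_write() is
a hypothetical stand-in for the blkdev/btrfs/cifs/ext4/generic callers patched
below; locking around the write is omitted): once dio_complete() issues
generic_write_sync() from the workqueue, the submitter must only sync on a
positive byte count and leave -EIOCBQUEUED alone.

#include <linux/fs.h>
#include <linux/aio.h>
#include <linux/uio.h>

static ssize_t my_fs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
				    unsigned long nr_segs, loff_t pos)
{
	struct file *file = iocb->ki_filp;
	ssize_t ret;

	/* i_mutex handling omitted for brevity */
	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);

	/*
	 * AIO DIO returns -EIOCBQUEUED here; the deferred completion path
	 * now calls generic_write_sync() itself, so only sync for a
	 * synchronous write that actually transferred bytes.
	 */
	if (ret > 0) {
		ssize_t err = generic_write_sync(file, pos, ret);
		if (err < 0)
			ret = err;
	}
	return ret;
}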
diff --git a/fs/block_dev.c b/fs/block_dev.c
index c7bda5c..1173a4e 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1519,7 +1519,7 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
blk_start_plug(&plug);
ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
- if (ret > 0 || ret == -EIOCBQUEUED) {
+ if (ret > 0) {
ssize_t err;
err = generic_write_sync(file, pos, ret);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 8e686a4..4d2eb64 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1727,7 +1727,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
*/
BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
BTRFS_I(inode)->last_sub_trans = root->log_transid;
- if (num_written > 0 || num_written == -EIOCBQUEUED) {
+ if (num_written > 0) {
err = generic_write_sync(file, pos, num_written);
if (err < 0 && num_written > 0)
num_written = err;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 1e57f36..d2653ee 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2552,7 +2552,7 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov,
mutex_unlock(&inode->i_mutex);
}
- if (rc > 0 || rc == -EIOCBQUEUED) {
+ if (rc > 0) {
ssize_t err;
err = generic_write_sync(file, pos, rc);
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 8b31b9f..1782023 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -266,8 +266,18 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret,
dio->end_io(dio->iocb, offset, transferred, dio->private);
inode_dio_done(dio->inode);
- if (is_async)
+ if (is_async) {
+ if (dio->rw & WRITE) {
+ int err;
+
+ err = generic_write_sync(dio->iocb->ki_filp, offset,
+ transferred);
+ if (err < 0 && ret > 0)
+ ret = err;
+ }
+
aio_complete(dio->iocb, ret, 0);
+ }
kmem_cache_free(dio_cache, dio);
return ret;
@@ -1183,11 +1193,6 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
}
/*
- * Will be decremented at I/O completion time.
- */
- atomic_inc(&inode->i_dio_count);
-
- /*
* For file extending writes updating i_size before data
* writeouts complete can expose uninitialized blocks. So
* even for AIO, we need to wait for i/o to complete before
@@ -1195,11 +1200,33 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
*/
dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) &&
(end > i_size_read(inode)));
-
- retval = 0;
-
dio->inode = inode;
dio->rw = rw;
+
+ /*
+ * For AIO O_(D)SYNC writes we need to defer completions to a workqueue
+ * so that we can call ->fsync.
+ */
+ if (dio->is_async && (rw & WRITE) &&
+ ((iocb->ki_filp->f_flags & O_DSYNC) ||
+ IS_SYNC(iocb->ki_filp->f_mapping->host))) {
+ retval = dio_set_defer_completion(dio);
+ if (retval) {
+ /*
+ * We grab i_mutex only for reads so we don't have
+ * to release it here
+ */
+ kmem_cache_free(dio_cache, dio);
+ goto out;
+ }
+ }
+
+ /*
+ * Will be decremented at I/O completion time.
+ */
+ atomic_inc(&inode->i_dio_count);
+
+ retval = 0;
sdio.blkbits = blkbits;
sdio.blkfactor = i_blkbits - blkbits;
sdio.block_in_file = offset >> blkbits;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 6f4cc56..bb9efc0 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -149,7 +149,7 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
mutex_unlock(&inode->i_mutex);
- if (ret > 0 || ret == -EIOCBQUEUED) {
+ if (ret > 0) {
ssize_t err;
err = generic_write_sync(file, pos, ret);
diff --git a/mm/filemap.c b/mm/filemap.c
index 4b51ac1..731a2c2 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2550,7 +2550,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
mutex_unlock(&inode->i_mutex);
- if (ret > 0 || ret == -EIOCBQUEUED) {
+ if (ret > 0) {
ssize_t err;
err = generic_write_sync(file, pos, ret);
--
1.8.1.4
* Re: [PATCH 0/2 v2] Fix O_SYNC AIO DIO
2013-08-14 9:10 [PATCH 0/2 v2] Fix O_SYNC AIO DIO Jan Kara
2013-08-14 9:10 ` [PATCH 1/2] direct-io: Implement generic deferred AIO completions Jan Kara
2013-08-14 9:10 ` [PATCH 2/2] direct-io: Handle O_(D)SYNC AIO Jan Kara
@ 2013-08-30 15:53 ` Al Viro
2013-09-04 10:54 ` Jan Kara
2 siblings, 1 reply; 7+ messages in thread
From: Al Viro @ 2013-08-30 15:53 UTC (permalink / raw)
To: Jan Kara; +Cc: linux-fsdevel, linux-ext4, xfs, Christoph Hellwig
On Wed, Aug 14, 2013 at 11:10:54AM +0200, Jan Kara wrote:
> Hello,
>
> this is the second iteration of the patches to fix handling of O_SYNC AIO DIO.
> Since the previous version I've addressed Dave's comments:
> - slightly expanded changelog of the first patch
> - workqueue is now created with parameters allowing parallelism
> - workqueue name contains sb->s_id
> - workqueue is created on demand (I decided to do this to reduce the overhead
> in unnecessary cases)
>
> The patchset survives an xfstests run for ext4 & xfs, so it should be sane. Since
> this touches several filesystems (although only ext4 & xfs are non-trivial),
> the question is who should carry these patches. Maybe Al? But since the xfs and
> ext4 changes are non-trivial, I'd like to have a review from their
> developers...
Looks sane, except that I'd probably put destroying the queue after
evict_inodes(), next to the ->put_super() call.
That said, there's another interesting problem in the code affected by that
sucker: generic_file_aio_write() might very well sync the wrong range.
Consider the O_APPEND case; __generic_file_aio_write() will call
generic_write_checks(), which will update its copy of pos, and proceed to
write starting from there. All right and proper, but then we return into
generic_file_aio_write() and sync the range of the right length, starting
at the *original* value of pos...
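For illustration, a simplified, hedged paraphrase of the 3.11-era
generic_file_aio_write() (compare the mm/filemap.c hunk in patch 2/2); this is
not a proposed fix, it only shows the shape of the problem described above.

#include <linux/fs.h>
#include <linux/aio.h>
#include <linux/mutex.h>

/* Paraphrased and simplified; error handling and plugging details omitted. */
ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
			       unsigned long nr_segs, loff_t pos)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	ssize_t ret;

	mutex_lock(&inode->i_mutex);
	/*
	 * For O_APPEND, generic_write_checks() inside this call rewrites
	 * its own copy of the position to i_size and writes there...
	 */
	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
	mutex_unlock(&inode->i_mutex);

	if (ret > 0) {
		ssize_t err;

		/*
		 * ...but the sync still starts at the caller-supplied pos,
		 * so for O_APPEND it can cover the wrong byte range.
		 */
		err = generic_write_sync(file, pos, ret);
		if (err < 0)
			ret = err;
	}
	return ret;
}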
* Re: [PATCH 0/2 v2] Fix O_SYNC AIO DIO
2013-08-30 15:53 ` [PATCH 0/2 v2] Fix O_SYNC AIO DIO Al Viro
@ 2013-09-04 10:54 ` Jan Kara
0 siblings, 0 replies; 7+ messages in thread
From: Jan Kara @ 2013-09-04 10:54 UTC (permalink / raw)
To: Al Viro; +Cc: linux-fsdevel, Christoph Hellwig, linux-ext4, Jan Kara, xfs
On Fri 30-08-13 16:53:01, Al Viro wrote:
> On Wed, Aug 14, 2013 at 11:10:54AM +0200, Jan Kara wrote:
> > Hello,
> >
> > this is the second iteration of the patches to fix handling of O_SYNC AIO DIO.
> > Since the previous version I've addressed Dave's comments:
> > - slightly expanded changelog of the first patch
> > - workqueue is now created with parameters allowing parallelism
> > - workqueue name contains sb->s_id
> > - workqueue is created on demand (I decided to do this to reduce the overhead
> > in unnecessary cases)
> >
> > The patchset survives an xfstests run for ext4 & xfs, so it should be sane. Since
> > this touches several filesystems (although only ext4 & xfs are non-trivial),
> > the question is who should carry these patches. Maybe Al? But since the xfs and
> > ext4 changes are non-trivial, I'd like to have a review from their
> > developers...
>
> Looks sane, except that I'd probably put destroying the queue after
> evict_inodes(), next to the ->put_super() call.
OK, I've changed that. I'll send v3 in a moment.
> That said, there's another interesting problem in the code affected by that
> sucker: generic_file_aio_write() might very well sync the wrong range.
> Consider the O_APPEND case; __generic_file_aio_write() will call
> generic_write_checks(), which will update its copy of pos, and proceed to
> write starting from there. All right and proper, but then we return into
> generic_file_aio_write() and sync the range of the right length, starting
> at the *original* value of pos...
Yes, that looks like a bug. I was looking into how we could fix that, and
the easiest fix seems to be to move generic_segment_checks() and
generic_write_checks() from __generic_file_aio_write() to
generic_file_aio_write(). There are only three callers of
__generic_file_aio_write(): cifs_writev(), which can and should use
generic_file_aio_write() anyway; ext4_file_dio_write(), which could use
generic_file_aio_write() if we cleaned up the code and moved it around a
bit; and blkdev_aio_write(), which really needs to call
__generic_file_aio_write() (it doesn't want to grab i_mutex). So that last
caller would need to do the moved checks manually.
But this all seems a bit complex, so I'd prefer to do it as a separate
series.
Honza
--
Jan Kara <jack@suse.cz>
SUSE Labs, CR
* [PATCH 0/2 v3] Fix O_SYNC AIO DIO
@ 2013-09-04 13:04 Jan Kara
2013-09-04 13:04 ` [PATCH 2/2] direct-io: Handle O_(D)SYNC AIO Jan Kara
0 siblings, 1 reply; 7+ messages in thread
From: Jan Kara @ 2013-09-04 13:04 UTC (permalink / raw)
To: Al Viro
Cc: Jan Kara, xfs, Christoph Hellwig, dchinner, linux-fsdevel,
linux-ext4
Hello,
this is the third iteration of the patches to fix handling of O_SYNC AIO DIO.
Changes since v2:
- moved the destruction of the workqueue closer to put_super
Changes since v1:
- slightly expanded changelog of the first patch
- workqueue is now created with parameters allowing parallelism
- workqueue name contains sb->s_id
- workqueue is created on demand (I decided to do this to reduce the overhead
in unnecessary cases)
The patchset survives an xfstests run for ext4 & xfs, so it should be sane. Al,
please merge the patches (but I'd also appreciate it if Dave or some other XFS
developer gave their ack to the patches, since the XFS changes are non-trivial).
Honza
* [PATCH 2/2] direct-io: Handle O_(D)SYNC AIO
2013-09-04 13:04 [PATCH 0/2 v3] " Jan Kara
@ 2013-09-04 13:04 ` Jan Kara
0 siblings, 0 replies; 7+ messages in thread
From: Jan Kara @ 2013-09-04 13:04 UTC (permalink / raw)
To: Al Viro
Cc: linux-fsdevel, linux-ext4, xfs, Christoph Hellwig, dchinner,
Christoph Hellwig, Jan Kara
From: Christoph Hellwig <hch@infradead.org>
Call generic_write_sync() from the deferred I/O completion handler if
O_DSYNC is set for a write request. Also make sure various callers
don't call generic_write_sync if the direct I/O code returns
-EIOCBQUEUED.
Based on an earlier patch from Jan Kara <jack@suse.cz> with updates from
Jeff Moyer <jmoyer@redhat.com> and Darrick J. Wong <darrick.wong@oracle.com>.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jan Kara <jack@suse.cz>
---
fs/block_dev.c | 2 +-
fs/btrfs/file.c | 2 +-
fs/cifs/file.c | 2 +-
fs/direct-io.c | 45 ++++++++++++++++++++++++++++++++++++---------
fs/ext4/file.c | 2 +-
mm/filemap.c | 2 +-
6 files changed, 41 insertions(+), 14 deletions(-)
diff --git a/fs/block_dev.c b/fs/block_dev.c
index c7bda5c..1173a4e 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1519,7 +1519,7 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
blk_start_plug(&plug);
ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
- if (ret > 0 || ret == -EIOCBQUEUED) {
+ if (ret > 0) {
ssize_t err;
err = generic_write_sync(file, pos, ret);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 8e686a4..4d2eb64 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1727,7 +1727,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
*/
BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
BTRFS_I(inode)->last_sub_trans = root->log_transid;
- if (num_written > 0 || num_written == -EIOCBQUEUED) {
+ if (num_written > 0) {
err = generic_write_sync(file, pos, num_written);
if (err < 0 && num_written > 0)
num_written = err;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 1e57f36..d2653ee 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2552,7 +2552,7 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov,
mutex_unlock(&inode->i_mutex);
}
- if (rc > 0 || rc == -EIOCBQUEUED) {
+ if (rc > 0) {
ssize_t err;
err = generic_write_sync(file, pos, rc);
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 8b31b9f..1782023 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -266,8 +266,18 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret,
dio->end_io(dio->iocb, offset, transferred, dio->private);
inode_dio_done(dio->inode);
- if (is_async)
+ if (is_async) {
+ if (dio->rw & WRITE) {
+ int err;
+
+ err = generic_write_sync(dio->iocb->ki_filp, offset,
+ transferred);
+ if (err < 0 && ret > 0)
+ ret = err;
+ }
+
aio_complete(dio->iocb, ret, 0);
+ }
kmem_cache_free(dio_cache, dio);
return ret;
@@ -1183,11 +1193,6 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
}
/*
- * Will be decremented at I/O completion time.
- */
- atomic_inc(&inode->i_dio_count);
-
- /*
* For file extending writes updating i_size before data
* writeouts complete can expose uninitialized blocks. So
* even for AIO, we need to wait for i/o to complete before
@@ -1195,11 +1200,33 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
*/
dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) &&
(end > i_size_read(inode)));
-
- retval = 0;
-
dio->inode = inode;
dio->rw = rw;
+
+ /*
+ * For AIO O_(D)SYNC writes we need to defer completions to a workqueue
+ * so that we can call ->fsync.
+ */
+ if (dio->is_async && (rw & WRITE) &&
+ ((iocb->ki_filp->f_flags & O_DSYNC) ||
+ IS_SYNC(iocb->ki_filp->f_mapping->host))) {
+ retval = dio_set_defer_completion(dio);
+ if (retval) {
+ /*
+ * We grab i_mutex only for reads so we don't have
+ * to release it here
+ */
+ kmem_cache_free(dio_cache, dio);
+ goto out;
+ }
+ }
+
+ /*
+ * Will be decremented at I/O completion time.
+ */
+ atomic_inc(&inode->i_dio_count);
+
+ retval = 0;
sdio.blkbits = blkbits;
sdio.blkfactor = i_blkbits - blkbits;
sdio.block_in_file = offset >> blkbits;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 6f4cc56..bb9efc0 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -149,7 +149,7 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
mutex_unlock(&inode->i_mutex);
- if (ret > 0 || ret == -EIOCBQUEUED) {
+ if (ret > 0) {
ssize_t err;
err = generic_write_sync(file, pos, ret);
diff --git a/mm/filemap.c b/mm/filemap.c
index 4b51ac1..731a2c2 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2550,7 +2550,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
mutex_unlock(&inode->i_mutex);
- if (ret > 0 || ret == -EIOCBQUEUED) {
+ if (ret > 0) {
ssize_t err;
err = generic_write_sync(file, pos, ret);
--
1.8.1.4
* [PATCH 1/2] direct-io: Implement generic deferred AIO completions
@ 2013-07-10 22:02 Jan Kara
2013-07-10 22:02 ` [PATCH 2/2] direct-io: Handle O_(D)SYNC AIO Jan Kara
0 siblings, 1 reply; 7+ messages in thread
From: Jan Kara @ 2013-07-10 22:02 UTC (permalink / raw)
To: linux-fsdevel
Cc: linux-ext4, xfs, hch, Jeff Moyer, Al Viro, Christoph Hellwig,
Jan Kara
From: Christoph Hellwig <hch@infradead.org>
Add support to the core direct-io code to defer AIO completions to user
context using a workqueue. This replaces open-coded and less efficient
code in XFS and ext4 and will be needed to properly support O_(D)SYNC
for AIO.
The communication between the filesystem and the direct I/O code requires
a new buffer head flag, which is a bit ugly but not avoidable until the
direct I/O code stops abusing the buffer_head structure for communicating
with the filesystems.
Currently this creates a per-superblock unbound workqueue for these
completions, which is taken from an earlier patch by Jan Kara. I'm
not really convinced about this use and would prefer a "normal" global
workqueue with a high concurrency limit, but this needs further discussion.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jan Kara <jack@suse.cz>
---
fs/direct-io.c | 50 ++++++++++++++++++++++++++++++---------------
fs/ext4/ext4.h | 11 ----------
fs/ext4/inode.c | 28 +++++++------------------
fs/ext4/page-io.c | 30 +++++++--------------------
fs/ext4/super.c | 16 ---------------
fs/ocfs2/aops.c | 8 +-------
fs/super.c | 28 +++++++++++++++++--------
fs/xfs/xfs_aops.c | 28 +++++--------------------
fs/xfs/xfs_aops.h | 3 ---
include/linux/buffer_head.h | 2 ++
include/linux/fs.h | 7 +++++--
11 files changed, 81 insertions(+), 130 deletions(-)
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 7ab90f5..36bddad 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -127,6 +127,7 @@ struct dio {
spinlock_t bio_lock; /* protects BIO fields below */
int page_errors; /* errno from get_user_pages() */
int is_async; /* is IO async ? */
+ bool defer_completion; /* defer AIO completion to workqueue? */
int io_error; /* IO error in completion path */
unsigned long refcount; /* direct_io_worker() and bios */
struct bio *bio_list; /* singly linked via bi_private */
@@ -141,7 +142,10 @@ struct dio {
* allocation time. Don't add new fields after pages[] unless you
* wish that they not be zeroed.
*/
- struct page *pages[DIO_PAGES]; /* page buffer */
+ union {
+ struct page *pages[DIO_PAGES]; /* page buffer */
+ struct work_struct complete_work;/* deferred AIO completion */
+ };
} ____cacheline_aligned_in_smp;
static struct kmem_cache *dio_cache __read_mostly;
@@ -221,16 +225,16 @@ static inline struct page *dio_get_page(struct dio *dio,
* dio_complete() - called when all DIO BIO I/O has been completed
* @offset: the byte offset in the file of the completed operation
*
- * This releases locks as dictated by the locking type, lets interested parties
- * know that a DIO operation has completed, and calculates the resulting return
- * code for the operation.
+ * This drops i_dio_count, lets interested parties know that a DIO operation
+ * has completed, and calculates the resulting return code for the operation.
*
* It lets the filesystem know if it registered an interest earlier via
* get_block. Pass the private field of the map buffer_head so that
* filesystems can use it to hold additional state between get_block calls and
* dio_complete.
*/
-static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is_async)
+static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret,
+ bool is_async)
{
ssize_t transferred = 0;
@@ -258,19 +262,26 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is
if (ret == 0)
ret = transferred;
- if (dio->end_io && dio->result) {
- dio->end_io(dio->iocb, offset, transferred,
- dio->private, ret, is_async);
- } else {
- inode_dio_done(dio->inode);
- if (is_async)
- aio_complete(dio->iocb, ret, 0);
- }
+ if (dio->end_io && dio->result)
+ dio->end_io(dio->iocb, offset, transferred, dio->private);
+
+ inode_dio_done(dio->inode);
+ if (is_async)
+ aio_complete(dio->iocb, ret, 0);
+ kmem_cache_free(dio_cache, dio);
return ret;
}
+static void dio_aio_complete_work(struct work_struct *work)
+{
+ struct dio *dio = container_of(work, struct dio, complete_work);
+
+ dio_complete(dio, dio->iocb->ki_pos, 0, true);
+}
+
static int dio_bio_complete(struct dio *dio, struct bio *bio);
+
/*
* Asynchronous IO callback.
*/
@@ -290,8 +301,13 @@ static void dio_bio_end_aio(struct bio *bio, int error)
spin_unlock_irqrestore(&dio->bio_lock, flags);
if (remaining == 0) {
- dio_complete(dio, dio->iocb->ki_pos, 0, true);
- kmem_cache_free(dio_cache, dio);
+ if (dio->result && dio->defer_completion) {
+ INIT_WORK(&dio->complete_work, dio_aio_complete_work);
+ queue_work(dio->inode->i_sb->s_dio_done_wq,
+ &dio->complete_work);
+ } else {
+ dio_complete(dio, dio->iocb->ki_pos, 0, true);
+ }
}
}
@@ -581,6 +597,9 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
/* Store for completion */
dio->private = map_bh->b_private;
+
+ if (buffer_defer_completion(map_bh))
+ dio->defer_completion = true;
}
return ret;
}
@@ -1269,7 +1288,6 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
if (drop_refcount(dio) == 0) {
retval = dio_complete(dio, offset, retval, false);
- kmem_cache_free(dio_cache, dio);
} else
BUG_ON(retval != -EIOCBQUEUED);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index b577e45..bb64697 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -180,7 +180,6 @@ struct ext4_map_blocks {
* Flags for ext4_io_end->flags
*/
#define EXT4_IO_END_UNWRITTEN 0x0001
-#define EXT4_IO_END_DIRECT 0x0002
/*
* For converting uninitialized extents on a work queue. 'handle' is used for
@@ -196,8 +195,6 @@ typedef struct ext4_io_end {
unsigned int flag; /* unwritten or not */
loff_t offset; /* offset in the file */
ssize_t size; /* size of the extent */
- struct kiocb *iocb; /* iocb struct for AIO */
- int result; /* error value for AIO */
atomic_t count; /* reference counter */
} ext4_io_end_t;
@@ -900,11 +897,9 @@ struct ext4_inode_info {
* Completed IOs that need unwritten extents handling and don't have
* transaction reserved
*/
- struct list_head i_unrsv_conversion_list;
atomic_t i_ioend_count; /* Number of outstanding io_end structs */
atomic_t i_unwritten; /* Nr. of inflight conversions pending */
struct work_struct i_rsv_conversion_work;
- struct work_struct i_unrsv_conversion_work;
spinlock_t i_block_reservation_lock;
@@ -1276,8 +1271,6 @@ struct ext4_sb_info {
struct flex_groups *s_flex_groups;
ext4_group_t s_flex_groups_allocated;
- /* workqueue for unreserved extent convertions (dio) */
- struct workqueue_struct *unrsv_conversion_wq;
/* workqueue for reserved extent conversions (buffered io) */
struct workqueue_struct *rsv_conversion_wq;
@@ -1340,9 +1333,6 @@ static inline void ext4_set_io_unwritten_flag(struct inode *inode,
struct ext4_io_end *io_end)
{
if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
- /* Writeback has to have coversion transaction reserved */
- WARN_ON(EXT4_SB(inode->i_sb)->s_journal && !io_end->handle &&
- !(io_end->flag & EXT4_IO_END_DIRECT));
io_end->flag |= EXT4_IO_END_UNWRITTEN;
atomic_inc(&EXT4_I(inode)->i_unwritten);
}
@@ -2715,7 +2705,6 @@ extern void ext4_put_io_end_defer(ext4_io_end_t *io_end);
extern void ext4_io_submit_init(struct ext4_io_submit *io,
struct writeback_control *wbc);
extern void ext4_end_io_rsv_work(struct work_struct *work);
-extern void ext4_end_io_unrsv_work(struct work_struct *work);
extern void ext4_io_submit(struct ext4_io_submit *io);
extern int ext4_bio_write_page(struct ext4_io_submit *io,
struct page *page,
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 0188e65..56faf56 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -730,8 +730,12 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
ret = ext4_map_blocks(handle, inode, &map, flags);
if (ret > 0) {
+ ext4_io_end_t *io_end = ext4_inode_aio(inode);
+
map_bh(bh, inode->i_sb, map.m_pblk);
bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
+ if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
+ set_buffer_defer_completion(bh);
bh->b_size = inode->i_sb->s_blocksize * map.m_len;
ret = 0;
}
@@ -2997,19 +3001,13 @@ static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
}
static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
- ssize_t size, void *private, int ret,
- bool is_async)
+ ssize_t size, void *private)
{
- struct inode *inode = file_inode(iocb->ki_filp);
ext4_io_end_t *io_end = iocb->private;
/* if not async direct IO just return */
- if (!io_end) {
- inode_dio_done(inode);
- if (is_async)
- aio_complete(iocb, ret, 0);
+ if (!io_end)
return;
- }
ext_debug("ext4_end_io_dio(): io_end 0x%p "
"for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
@@ -3019,11 +3017,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
iocb->private = NULL;
io_end->offset = offset;
io_end->size = size;
- if (is_async) {
- io_end->iocb = iocb;
- io_end->result = ret;
- }
- ext4_put_io_end_defer(io_end);
+ ext4_put_io_end(io_end);
}
/*
@@ -3108,7 +3102,6 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
ret = -ENOMEM;
goto retake_lock;
}
- io_end->flag |= EXT4_IO_END_DIRECT;
/*
* Grab reference for DIO. Will be dropped in ext4_end_io_dio()
*/
@@ -3153,13 +3146,6 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) {
WARN_ON(iocb->private != io_end);
WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
- WARN_ON(io_end->iocb);
- /*
- * Generic code already did inode_dio_done() so we
- * have to clear EXT4_IO_END_DIRECT to not do it for
- * the second time.
- */
- io_end->flag = 0;
ext4_put_io_end(io_end);
iocb->private = NULL;
}
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 48786cd..d25fab4 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -122,10 +122,6 @@ static void ext4_release_io_end(ext4_io_end_t *io_end)
ext4_finish_bio(bio);
bio_put(bio);
}
- if (io_end->flag & EXT4_IO_END_DIRECT)
- inode_dio_done(io_end->inode);
- if (io_end->iocb)
- aio_complete(io_end->iocb, io_end->result, 0);
kmem_cache_free(io_end_cachep, io_end);
}
@@ -203,19 +199,14 @@ static void ext4_add_complete_io(ext4_io_end_t *io_end)
struct workqueue_struct *wq;
unsigned long flags;
- BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
+ /* Only reserved conversions from writeback should enter here */
+ WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
+ WARN_ON(!io_end->handle);
spin_lock_irqsave(&ei->i_completed_io_lock, flags);
- if (io_end->handle) {
- wq = EXT4_SB(io_end->inode->i_sb)->rsv_conversion_wq;
- if (list_empty(&ei->i_rsv_conversion_list))
- queue_work(wq, &ei->i_rsv_conversion_work);
- list_add_tail(&io_end->list, &ei->i_rsv_conversion_list);
- } else {
- wq = EXT4_SB(io_end->inode->i_sb)->unrsv_conversion_wq;
- if (list_empty(&ei->i_unrsv_conversion_list))
- queue_work(wq, &ei->i_unrsv_conversion_work);
- list_add_tail(&io_end->list, &ei->i_unrsv_conversion_list);
- }
+ wq = EXT4_SB(io_end->inode->i_sb)->rsv_conversion_wq;
+ if (list_empty(&ei->i_rsv_conversion_list))
+ queue_work(wq, &ei->i_rsv_conversion_work);
+ list_add_tail(&io_end->list, &ei->i_rsv_conversion_list);
spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
}
@@ -255,13 +246,6 @@ void ext4_end_io_rsv_work(struct work_struct *work)
ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_rsv_conversion_list);
}
-void ext4_end_io_unrsv_work(struct work_struct *work)
-{
- struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info,
- i_unrsv_conversion_work);
- ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_unrsv_conversion_list);
-}
-
ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
{
ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 85b3dd6..5ab89a0 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -762,9 +762,7 @@ static void ext4_put_super(struct super_block *sb)
ext4_unregister_li_request(sb);
dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
- flush_workqueue(sbi->unrsv_conversion_wq);
flush_workqueue(sbi->rsv_conversion_wq);
- destroy_workqueue(sbi->unrsv_conversion_wq);
destroy_workqueue(sbi->rsv_conversion_wq);
if (sbi->s_journal) {
@@ -875,14 +873,12 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
#endif
ei->jinode = NULL;
INIT_LIST_HEAD(&ei->i_rsv_conversion_list);
- INIT_LIST_HEAD(&ei->i_unrsv_conversion_list);
spin_lock_init(&ei->i_completed_io_lock);
ei->i_sync_tid = 0;
ei->i_datasync_tid = 0;
atomic_set(&ei->i_ioend_count, 0);
atomic_set(&ei->i_unwritten, 0);
INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
- INIT_WORK(&ei->i_unrsv_conversion_work, ext4_end_io_unrsv_work);
return &ei->vfs_inode;
}
@@ -3960,14 +3956,6 @@ no_journal:
goto failed_mount4;
}
- EXT4_SB(sb)->unrsv_conversion_wq =
- alloc_workqueue("ext4-unrsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
- if (!EXT4_SB(sb)->unrsv_conversion_wq) {
- printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");
- ret = -ENOMEM;
- goto failed_mount4;
- }
-
/*
* The jbd2_journal_load will have done any necessary log recovery,
* so we can safely mount the rest of the filesystem now.
@@ -4121,8 +4109,6 @@ failed_mount4:
ext4_msg(sb, KERN_ERR, "mount failed");
if (EXT4_SB(sb)->rsv_conversion_wq)
destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
- if (EXT4_SB(sb)->unrsv_conversion_wq)
- destroy_workqueue(EXT4_SB(sb)->unrsv_conversion_wq);
failed_mount_wq:
if (sbi->s_journal) {
jbd2_journal_destroy(sbi->s_journal);
@@ -4570,7 +4556,6 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
trace_ext4_sync_fs(sb, wait);
flush_workqueue(sbi->rsv_conversion_wq);
- flush_workqueue(sbi->unrsv_conversion_wq);
/*
* Writeback quota in non-journalled quota case - journalled quota has
* no dirty dquots
@@ -4606,7 +4591,6 @@ static int ext4_sync_fs_nojournal(struct super_block *sb, int wait)
trace_ext4_sync_fs(sb, wait);
flush_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
- flush_workqueue(EXT4_SB(sb)->unrsv_conversion_wq);
dquot_writeback_dquots(sb, -1);
if (wait && test_opt(sb, BARRIER))
ret = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 79736a2..a533593 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -565,9 +565,7 @@ bail:
static void ocfs2_dio_end_io(struct kiocb *iocb,
loff_t offset,
ssize_t bytes,
- void *private,
- int ret,
- bool is_async)
+ void *private)
{
struct inode *inode = file_inode(iocb->ki_filp);
int level;
@@ -592,10 +590,6 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
level = ocfs2_iocb_rw_locked_level(iocb);
ocfs2_rw_unlock(inode, level);
-
- inode_dio_done(inode);
- if (is_async)
- aio_complete(iocb, ret, 0);
}
/*
diff --git a/fs/super.c b/fs/super.c
index 7465d43..4992012 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -152,15 +152,21 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
static const struct super_operations default_op;
if (s) {
- if (security_sb_alloc(s)) {
- /*
- * We cannot call security_sb_free() without
- * security_sb_alloc() succeeding. So bail out manually
- */
- kfree(s);
- s = NULL;
- goto out;
+ /*
+ * Alloc workqueue only for filesystems that need it. This also
+ * avoids problems with pseudo filesystems which get initialized
+ * before workqueues can be created
+ */
+ if (type->fs_flags & FS_REQUIRES_DEV) {
+ s->s_dio_done_wq =
+ alloc_workqueue("dio-sync", WQ_UNBOUND, 1);
+ if (!s->s_dio_done_wq)
+ goto out_free_sb;
}
+
+ if (security_sb_alloc(s))
+ goto out_destroy_wq;
+
#ifdef CONFIG_SMP
s->s_files = alloc_percpu(struct list_head);
if (!s->s_files)
@@ -228,6 +234,10 @@ err_out:
free_percpu(s->s_files);
#endif
destroy_sb_writers(s);
+out_destroy_wq:
+ if (s->s_dio_done_wq)
+ destroy_workqueue(s->s_dio_done_wq);
+out_free_sb:
kfree(s);
s = NULL;
goto out;
@@ -244,6 +254,8 @@ static inline void destroy_super(struct super_block *s)
#ifdef CONFIG_SMP
free_percpu(s->s_files);
#endif
+ if (s->s_dio_done_wq)
+ destroy_workqueue(s->s_dio_done_wq);
destroy_sb_writers(s);
security_sb_free(s);
WARN_ON(!list_empty(&s->s_mounts));
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 596ec71..e11d654 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -86,14 +86,6 @@ xfs_destroy_ioend(
bh->b_end_io(bh, !ioend->io_error);
}
- if (ioend->io_iocb) {
- inode_dio_done(ioend->io_inode);
- if (ioend->io_isasync) {
- aio_complete(ioend->io_iocb, ioend->io_error ?
- ioend->io_error : ioend->io_result, 0);
- }
- }
-
mempool_free(ioend, xfs_ioend_pool);
}
@@ -281,7 +273,6 @@ xfs_alloc_ioend(
* all the I/O from calling the completion routine too early.
*/
atomic_set(&ioend->io_remaining, 1);
- ioend->io_isasync = 0;
ioend->io_isdirect = 0;
ioend->io_error = 0;
ioend->io_list = NULL;
@@ -291,8 +282,6 @@ xfs_alloc_ioend(
ioend->io_buffer_tail = NULL;
ioend->io_offset = 0;
ioend->io_size = 0;
- ioend->io_iocb = NULL;
- ioend->io_result = 0;
ioend->io_append_trans = NULL;
INIT_WORK(&ioend->io_work, xfs_end_io);
@@ -1292,8 +1281,10 @@ __xfs_get_blocks(
if (create || !ISUNWRITTEN(&imap))
xfs_map_buffer(inode, bh_result, &imap, offset);
if (create && ISUNWRITTEN(&imap)) {
- if (direct)
+ if (direct) {
bh_result->b_private = inode;
+ set_buffer_defer_completion(bh_result);
+ }
set_buffer_unwritten(bh_result);
}
}
@@ -1390,9 +1381,7 @@ xfs_end_io_direct_write(
struct kiocb *iocb,
loff_t offset,
ssize_t size,
- void *private,
- int ret,
- bool is_async)
+ void *private)
{
struct xfs_ioend *ioend = iocb->private;
@@ -1414,17 +1403,10 @@ xfs_end_io_direct_write(
ioend->io_offset = offset;
ioend->io_size = size;
- ioend->io_iocb = iocb;
- ioend->io_result = ret;
if (private && size > 0)
ioend->io_type = XFS_IO_UNWRITTEN;
- if (is_async) {
- ioend->io_isasync = 1;
- xfs_finish_ioend(ioend);
- } else {
- xfs_finish_ioend_sync(ioend);
- }
+ xfs_finish_ioend_sync(ioend);
}
STATIC ssize_t
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index c325abb..f94dd45 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -45,7 +45,6 @@ typedef struct xfs_ioend {
unsigned int io_type; /* delalloc / unwritten */
int io_error; /* I/O error code */
atomic_t io_remaining; /* hold count */
- unsigned int io_isasync : 1; /* needs aio_complete */
unsigned int io_isdirect : 1;/* direct I/O */
struct inode *io_inode; /* file being written to */
struct buffer_head *io_buffer_head;/* buffer linked list head */
@@ -54,8 +53,6 @@ typedef struct xfs_ioend {
xfs_off_t io_offset; /* offset in the file */
struct work_struct io_work; /* xfsdatad work queue */
struct xfs_trans *io_append_trans;/* xact. for size update */
- struct kiocb *io_iocb;
- int io_result;
} xfs_ioend_t;
extern const struct address_space_operations xfs_address_space_operations;
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 91fa9a9..d77797a 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -36,6 +36,7 @@ enum bh_state_bits {
BH_Quiet, /* Buffer Error Prinks to be quiet */
BH_Meta, /* Buffer contains metadata */
BH_Prio, /* Buffer should be submitted with REQ_PRIO */
+ BH_Defer_Completion, /* Defer AIO completion to workqueue */
BH_PrivateStart,/* not a state bit, but the first bit available
* for private allocation by other entities
@@ -128,6 +129,7 @@ BUFFER_FNS(Write_EIO, write_io_error)
BUFFER_FNS(Unwritten, unwritten)
BUFFER_FNS(Meta, meta)
BUFFER_FNS(Prio, prio)
+BUFFER_FNS(Defer_Completion, defer_completion)
#define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 99be011..5b3fa72 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -45,6 +45,7 @@ struct vfsmount;
struct cred;
struct swap_info_struct;
struct seq_file;
+struct workqueue_struct;
extern void __init inode_init(void);
extern void __init inode_init_early(void);
@@ -62,8 +63,7 @@ struct buffer_head;
typedef int (get_block_t)(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
- ssize_t bytes, void *private, int ret,
- bool is_async);
+ ssize_t bytes, void *private);
#define MAY_EXEC 0x00000001
#define MAY_WRITE 0x00000002
@@ -1325,6 +1325,9 @@ struct super_block {
/* Being remounted read-only */
int s_readonly_remount;
+
+ /* AIO completions deferred from interrupt context */
+ struct workqueue_struct *s_dio_done_wq;
};
/* superblock cache pruning functions */
--
1.8.1.4
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [PATCH 2/2] direct-io: Handle O_(D)SYNC AIO
2013-07-10 22:02 [PATCH 1/2] direct-io: Implement generic deferred AIO completions Jan Kara
@ 2013-07-10 22:02 ` Jan Kara
0 siblings, 0 replies; 7+ messages in thread
From: Jan Kara @ 2013-07-10 22:02 UTC (permalink / raw)
To: linux-fsdevel
Cc: linux-ext4, xfs, hch, Jeff Moyer, Al Viro, Christoph Hellwig,
Jan Kara
From: Christoph Hellwig <hch@infradead.org>
Call generic_write_sync() from the deferred I/O completion handler if
O_DSYNC is set for a write request. Also make sure the various callers
no longer call generic_write_sync() when the direct I/O code returns
-EIOCBQUEUED, since the sync is now issued from the deferred completion
instead.
Based on an earlier patch from Jan Kara <jack@suse.cz> with updates from
Jeff Moyer <jmoyer@redhat.com> and Darrick J. Wong <darrick.wong@oracle.com>.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jan Kara <jack@suse.cz>
---
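A minimal userspace sketch of the case this series fixes (not part of the
patch; it assumes libaio and uses a hypothetical "testfile" path): submit an
O_DIRECT | O_DSYNC AIO write and reap its completion. With the series applied
the completion event is only delivered once generic_write_sync() has run and
the data is stable; previously the event could arrive before that.

#define _GNU_SOURCE		/* for O_DIRECT */
#include <err.h>
#include <fcntl.h>
#include <libaio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	io_context_t ctx = 0;
	struct iocb cb, *cbs[1] = { &cb };
	struct io_event ev;
	void *buf;
	int fd;

	fd = open("testfile", O_WRONLY | O_CREAT | O_DIRECT | O_DSYNC, 0644);
	if (fd < 0)
		err(1, "open");
	if (posix_memalign(&buf, 4096, 4096))
		err(1, "posix_memalign");
	memset(buf, 0xab, 4096);

	if (io_setup(1, &ctx) < 0)
		errx(1, "io_setup");
	io_prep_pwrite(&cb, fd, buf, 4096, 0);
	if (io_submit(ctx, 1, cbs) != 1)
		errx(1, "io_submit");
	/* Reap the completion; with O_DSYNC honoured this implies stable data. */
	if (io_getevents(ctx, 1, 1, &ev, NULL) != 1)
		errx(1, "io_getevents");

	io_destroy(ctx);
	close(fd);
	free(buf);
	return 0;
}

Build with something like "gcc -O2 -o dsync-aio dsync-aio.c -laio"; the file
and program names above are placeholders only.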
fs/block_dev.c | 2 +-
fs/btrfs/file.c | 2 +-
fs/cifs/file.c | 2 +-
fs/direct-io.c | 21 ++++++++++++++++++++-
fs/ext4/file.c | 2 +-
mm/filemap.c | 2 +-
6 files changed, 25 insertions(+), 6 deletions(-)
diff --git a/fs/block_dev.c b/fs/block_dev.c
index bb43ce0..a884054 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1512,7 +1512,7 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
blk_start_plug(&plug);
ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
- if (ret > 0 || ret == -EIOCBQUEUED) {
+ if (ret > 0) {
ssize_t err;
err = generic_write_sync(file, pos, ret);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 89da56a..6eb61da 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1610,7 +1610,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
*/
BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
BTRFS_I(inode)->last_sub_trans = root->log_transid;
- if (num_written > 0 || num_written == -EIOCBQUEUED) {
+ if (num_written > 0) {
err = generic_write_sync(file, pos, num_written);
if (err < 0 && num_written > 0)
num_written = err;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 91d8629..275ab19 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2530,7 +2530,7 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov,
mutex_unlock(&inode->i_mutex);
}
- if (rc > 0 || rc == -EIOCBQUEUED) {
+ if (rc > 0) {
ssize_t err;
err = generic_write_sync(file, pos, rc);
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 36bddad..fe8562e 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -266,8 +266,18 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret,
dio->end_io(dio->iocb, offset, transferred, dio->private);
inode_dio_done(dio->inode);
- if (is_async)
+ if (is_async) {
+ if (dio->rw & WRITE) {
+ int err;
+
+ err = generic_write_sync(dio->iocb->ki_filp, offset,
+ transferred);
+ if (err < 0 && ret > 0)
+ ret = err;
+ }
+
aio_complete(dio->iocb, ret, 0);
+ }
kmem_cache_free(dio_cache, dio);
return ret;
@@ -1161,6 +1171,15 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) &&
(end > i_size_read(inode)));
+ /*
+ * For AIO O_(D)SYNC writes we need to defer completions to a workqueue
+ * so that we can call ->fsync.
+ */
+ if (dio->is_async && (rw & WRITE) &&
+ ((iocb->ki_filp->f_flags & O_DSYNC) ||
+ IS_SYNC(iocb->ki_filp->f_mapping->host)))
+ dio->defer_completion = true;
+
retval = 0;
dio->inode = inode;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 6f4cc56..bb9efc0 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -149,7 +149,7 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
mutex_unlock(&inode->i_mutex);
- if (ret > 0 || ret == -EIOCBQUEUED) {
+ if (ret > 0) {
ssize_t err;
err = generic_write_sync(file, pos, ret);
diff --git a/mm/filemap.c b/mm/filemap.c
index 7905fe7..928572e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2550,7 +2550,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
mutex_unlock(&inode->i_mutex);
- if (ret > 0 || ret == -EIOCBQUEUED) {
+ if (ret > 0) {
ssize_t err;
err = generic_write_sync(file, pos, ret);
--
1.8.1.4
^ permalink raw reply related [flat|nested] 7+ messages in thread