[PATCH 0/7, v2] fs: fix up AIO+DIO+O_SYNC to actually do the sync part

linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH 0/7, v2] fs: fix up AIO+DIO+O_SYNC to actually do the sync part
@ 2012-03-02 19:56 Jeff Moyer
  2012-03-02 19:56 ` [PATCH 1/7] vfs: Handle O_SYNC AIO DIO in generic code properly Jeff Moyer
                   ` (6 more replies)
  0 siblings, 7 replies; 9+ messages in thread
From: Jeff Moyer @ 2012-03-02 19:56 UTC (permalink / raw)
  To: linux-fsdevel, linux-ext4, xfs; +Cc: jack, hch

Hi,

Currently, AIO+DIO+O_SYNC writes are not actually sync'd (for xfs), or
they are sync'd before the I/O is actually issued (everybody else).
The following patch series fixes this in two parts.  First, for the
file systems that use the generic routines, Jan has provided some
generic infrastructure to perform the syncs after the I/O is completed.
Second, for those file systems which require some endio processing of
their own for O_DIRECT writes (xfs and ext4), I've implemented file
system specific syncing.  This passes the updated xfs-tests 113 test
I posted earlier, as well as all of the tests in the aio group.  I
tested ext3, ext4, xfs, and btrfs only.

Comments, as always, are appreciated.

Cheers,
Jeff

[PATCH 1/7] vfs: Handle O_SYNC AIO DIO in generic code properly
[PATCH 2/7] ocfs2: Use generic handlers of O_SYNC AIO DIO
[PATCH 3/7] gfs2: Use generic handlers of O_SYNC AIO DIO
[PATCH 4/7] btrfs: Use generic handlers of O_SYNC AIO DIO
[PATCH 5/7] xfs: honor the O_SYNC flag for aysnchronous direct I/O requests
[PATCH 6/7] ext4: honor the O_SYNC flag for aysnchronous direct I/O requests
[PATCH 7/7] filemap: don't call generic_write_sync for -EIOCBQUEUED

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH 1/7] vfs: Handle O_SYNC AIO DIO in generic code properly
  2012-03-02 19:56 [PATCH 0/7, v2] fs: fix up AIO+DIO+O_SYNC to actually do the sync part Jeff Moyer
@ 2012-03-02 19:56 ` Jeff Moyer
  2012-03-02 19:56 ` [PATCH 2/7] ocfs2: Use generic handlers of O_SYNC AIO DIO Jeff Moyer
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 9+ messages in thread
From: Jeff Moyer @ 2012-03-02 19:56 UTC (permalink / raw)
  To: linux-fsdevel, linux-ext4, xfs; +Cc: jack, hch, Jeff Moyer

From: Jan Kara <jack@suse.cz>

Provide VFS helpers for handling O_SYNC AIO DIO writes. Filesystem wanting to
use the helpers has to pass DIO_SYNC_WRITES to __blockdev_direct_IO. Then if
they don't use direct IO end_io handler, generic code takes care of everything
else. Otherwise their end_io handler is passed struct dio_sync_io_work pointer
as 'private' argument and they have to call generic_dio_end_io() to finish
their AIO DIO. Generic code then takes care to call generic_write_sync() from
a workqueue context when AIO DIO is completed.

Since all filesystems using blockdev_direct_IO() need O_SYNC aio dio handling
and the generic one is enough for them, make blockdev_direct_IO() pass
DIO_SYNC_WRITES flag.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
---
 fs/direct-io.c     |  128 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 fs/super.c         |    2 +
 include/linux/fs.h |   13 +++++-
 3 files changed, 138 insertions(+), 5 deletions(-)

diff --git a/fs/direct-io.c b/fs/direct-io.c
index d740ab6..da6ef1f 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -37,6 +37,8 @@
 #include <linux/uio.h>
 #include <linux/atomic.h>
 
+#include <asm/cmpxchg.h>
+
 /*
  * How many user pages to map in one call to get_user_pages().  This determines
  * the size of a structure in the slab cache
@@ -111,6 +113,15 @@ struct dio_submit {
 	unsigned tail;			/* last valid page + 1 */
 };
 
+/* state needed for final sync and completion of O_SYNC AIO DIO */
+struct dio_sync_io_work {
+	struct kiocb *iocb;
+	loff_t offset;
+	ssize_t len;
+	int ret;
+	struct work_struct work;
+};
+
 /* dio_state communicated between submission path and end_io */
 struct dio {
 	int flags;			/* doesn't change */
@@ -133,6 +144,7 @@ struct dio {
 	/* AIO related stuff */
 	struct kiocb *iocb;		/* kiocb */
 	ssize_t result;                 /* IO result */
+	struct dio_sync_io_work *sync_work;	/* work used for O_SYNC AIO */
 
 	/*
 	 * pages[] (and any fields placed after it) are not zeroed out at
@@ -260,6 +272,45 @@ static inline struct page *dio_get_page(struct dio *dio,
 }
 
 /**
+ * generic_dio_end_io() - generic dio ->end_io handler
+ * @iocb: iocb of finishing DIO
+ * @offset: the byte offset in the file of the completed operation
+ * @bytes: length of the completed operation
+ * @work: work to queue for O_SYNC AIO DIO, NULL otherwise
+ * @ret: error code if IO failed
+ * @is_async: is this AIO?
+ *
+ * This is generic callback to be called when direct IO is finished. It
+ * handles update of number of outstanding DIOs for an inode, completion
+ * of async iocb and queueing of work if we need to call fsync() because
+ * io was O_SYNC.
+ */
+void generic_dio_end_io(struct kiocb *iocb, loff_t offset, ssize_t bytes,
+			struct dio_sync_io_work *work, int ret, bool is_async)
+{
+	struct inode *inode = iocb->ki_filp->f_dentry->d_inode;
+
+	if (!is_async) {
+		inode_dio_done(inode);
+		return;
+	}
+
+	/*
+	 * If we need to sync file, we offload completion to workqueue
+	 */
+	if (work) {
+		work->ret = ret;
+		work->offset = offset;
+		work->len = bytes;
+		queue_work(inode->i_sb->s_dio_flush_wq, &work->work);
+	} else {
+		aio_complete(iocb, ret, 0);
+		inode_dio_done(inode);
+	}
+}
+EXPORT_SYMBOL(generic_dio_end_io);
+
+/**
  * dio_complete() - called when all DIO BIO I/O has been completed
  * @offset: the byte offset in the file of the completed operation
  *
@@ -301,12 +352,22 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is
 		ret = transferred;
 
 	if (dio->end_io && dio->result) {
+		void *private;
+
+		if (dio->sync_work)
+			private = dio->sync_work;
+		else
+			private = dio->private;
 		dio->end_io(dio->iocb, offset, transferred,
-			    dio->private, ret, is_async);
+			    private, ret, is_async);
 	} else {
-		if (is_async)
-			aio_complete(dio->iocb, ret, 0);
-		inode_dio_done(dio->inode);
+		/* No IO submitted? Skip syncing... */
+		if (!dio->result && dio->sync_work) {
+			kfree(dio->sync_work);
+			dio->sync_work = NULL;
+		}
+		generic_dio_end_io(dio->iocb, offset, transferred,
+				   dio->sync_work, ret, is_async);
 	}
 
 	return ret;
@@ -1066,6 +1127,41 @@ static inline int drop_refcount(struct dio *dio)
 }
 
 /*
+ * Work performed from workqueue when AIO DIO is finished.
+ */
+static void dio_aio_sync_work(struct work_struct *work)
+{
+	struct dio_sync_io_work *sync_work =
+			container_of(work, struct dio_sync_io_work, work);
+	struct kiocb *iocb = sync_work->iocb;
+	struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
+	int err, ret = sync_work->ret;
+
+	err = generic_write_sync(iocb->ki_filp, sync_work->offset,
+				 sync_work->len);
+	if (err < 0 && ret > 0)
+		ret = err;
+	aio_complete(iocb, ret, 0);
+	inode_dio_done(inode);
+}
+
+static noinline int dio_create_flush_wq(struct super_block *sb)
+{
+	struct workqueue_struct *wq =
+				alloc_workqueue("dio-sync", WQ_UNBOUND, 1);
+
+	if (!wq)
+		return -ENOMEM;
+	/*
+	 * Atomically put workqueue in place. Release our one in case someone
+	 * else won the race and attached workqueue to superblock.
+	 */
+	if (cmpxchg(&sb->s_dio_flush_wq, NULL, wq))
+		destroy_workqueue(wq);
+	return 0;
+}
+
+/*
  * This is a library function for use by filesystem drivers.
  *
  * The locking rules are governed by the flags parameter:
@@ -1154,6 +1250,26 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	memset(dio, 0, offsetof(struct dio, pages));
 
 	dio->flags = flags;
+	if (flags & DIO_SYNC_WRITES && rw & WRITE &&
+	    ((iocb->ki_filp->f_flags & O_DSYNC) || IS_SYNC(inode))) {
+		/* The first O_SYNC AIO DIO for this FS? Create workqueue... */
+		if (!inode->i_sb->s_dio_flush_wq) {
+			retval = dio_create_flush_wq(inode->i_sb);
+			if (retval) {
+				kmem_cache_free(dio_cache, dio);
+				goto out;
+			}
+		}
+		dio->sync_work = kmalloc(sizeof(struct dio_sync_io_work),
+					 GFP_KERNEL);
+		if (!dio->sync_work) {
+			retval = -ENOMEM;
+			kmem_cache_free(dio_cache, dio);
+			goto out;
+		}
+		INIT_WORK(&dio->sync_work->work, dio_aio_sync_work);
+		dio->sync_work->iocb = iocb;
+	}
 	if (dio->flags & DIO_LOCKING) {
 		if (rw == READ) {
 			struct address_space *mapping =
@@ -1166,6 +1282,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 							      end - 1);
 			if (retval) {
 				mutex_unlock(&inode->i_mutex);
+				kfree(dio->sync_work);
 				kmem_cache_free(dio_cache, dio);
 				goto out;
 			}
@@ -1309,6 +1426,9 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 
 	if (drop_refcount(dio) == 0) {
 		retval = dio_complete(dio, offset, retval, false);
+		/* Test for !NULL to save a call for common case */
+		if (dio->sync_work)
+			kfree(dio->sync_work);
 		kmem_cache_free(dio_cache, dio);
 	} else
 		BUG_ON(retval != -EIOCBQUEUED);
diff --git a/fs/super.c b/fs/super.c
index de41e1e..d739ec2 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -200,6 +200,8 @@ static inline void destroy_super(struct super_block *s)
 #ifdef CONFIG_SMP
 	free_percpu(s->s_files);
 #endif
+	if (s->s_dio_flush_wq)
+		destroy_workqueue(s->s_dio_flush_wq);
 	security_sb_free(s);
 	WARN_ON(!list_empty(&s->s_mounts));
 	kfree(s->s_subtype);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 7aacf31..42d734d 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -410,6 +410,7 @@ struct kstatfs;
 struct vm_area_struct;
 struct vfsmount;
 struct cred;
+struct workqueue_struct;
 
 extern void __init inode_init(void);
 extern void __init inode_init_early(void);
@@ -1488,6 +1489,9 @@ struct super_block {
 
 	/* Being remounted read-only */
 	int s_readonly_remount;
+
+	/* Pending fsync calls for completed AIO DIO with O_SYNC */
+	struct workqueue_struct	*s_dio_flush_wq;
 };
 
 /* superblock cache pruning functions */
@@ -2420,11 +2424,18 @@ enum {
 
 	/* filesystem does not support filling holes */
 	DIO_SKIP_HOLES	= 0x02,
+
+	/* need generic handling of O_SYNC aio writes */
+	DIO_SYNC_WRITES = 0x04
 };
 
+struct dio_sync_io_work;
+
 void dio_end_io(struct bio *bio, int error);
 void inode_dio_wait(struct inode *inode);
 void inode_dio_done(struct inode *inode);
+void generic_dio_end_io(struct kiocb *iocb, loff_t offset, ssize_t bytes,
+                        struct dio_sync_io_work *work, int ret, bool is_async);
 
 ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	struct block_device *bdev, const struct iovec *iov, loff_t offset,
@@ -2437,7 +2448,7 @@ static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb,
 {
 	return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
 				    offset, nr_segs, get_block, NULL, NULL,
-				    DIO_LOCKING | DIO_SKIP_HOLES);
+				    DIO_LOCKING | DIO_SKIP_HOLES | DIO_SYNC_WRITES);
 }
 #else
 static inline void inode_dio_wait(struct inode *inode)
-- 
1.7.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH 2/7] ocfs2: Use generic handlers of O_SYNC AIO DIO
  2012-03-02 19:56 [PATCH 0/7, v2] fs: fix up AIO+DIO+O_SYNC to actually do the sync part Jeff Moyer
  2012-03-02 19:56 ` [PATCH 1/7] vfs: Handle O_SYNC AIO DIO in generic code properly Jeff Moyer
@ 2012-03-02 19:56 ` Jeff Moyer
  2012-03-02 19:56 ` [PATCH 3/7] gfs2: " Jeff Moyer
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 9+ messages in thread
From: Jeff Moyer @ 2012-03-02 19:56 UTC (permalink / raw)
  To: linux-fsdevel, linux-ext4, xfs; +Cc: jack, hch, Jeff Moyer

From: Jan Kara <jack@suse.cz>

Use generic handlers to queue fsync() when AIO DIO is completed for O_SYNC
file.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
---
 fs/ocfs2/aops.c |    6 ++----
 1 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 78b68af..3d14c2b 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -593,9 +593,7 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
 	level = ocfs2_iocb_rw_locked_level(iocb);
 	ocfs2_rw_unlock(inode, level);
 
-	if (is_async)
-		aio_complete(iocb, ret, 0);
-	inode_dio_done(inode);
+	generic_dio_end_io(iocb, offset, bytes, private, ret, is_async);
 }
 
 /*
@@ -642,7 +640,7 @@ static ssize_t ocfs2_direct_IO(int rw,
 	return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
 				    iov, offset, nr_segs,
 				    ocfs2_direct_IO_get_blocks,
-				    ocfs2_dio_end_io, NULL, 0);
+				    ocfs2_dio_end_io, NULL, DIO_SYNC_WRITES);
 }
 
 static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
-- 
1.7.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH 3/7] gfs2: Use generic handlers of O_SYNC AIO DIO
  2012-03-02 19:56 [PATCH 0/7, v2] fs: fix up AIO+DIO+O_SYNC to actually do the sync part Jeff Moyer
  2012-03-02 19:56 ` [PATCH 1/7] vfs: Handle O_SYNC AIO DIO in generic code properly Jeff Moyer
  2012-03-02 19:56 ` [PATCH 2/7] ocfs2: Use generic handlers of O_SYNC AIO DIO Jeff Moyer
@ 2012-03-02 19:56 ` Jeff Moyer
  2012-03-02 19:56 ` [PATCH 4/7] btrfs: " Jeff Moyer
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 9+ messages in thread
From: Jeff Moyer @ 2012-03-02 19:56 UTC (permalink / raw)
  To: linux-fsdevel, linux-ext4, xfs; +Cc: jack, hch, Jeff Moyer

From: Jan Kara <jack@suse.cz>

Use generic handlers to queue fsync() when AIO DIO is completed for O_SYNC
file.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
---
 fs/gfs2/aops.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 501e5cb..9c381ff 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -1034,7 +1034,7 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
 
 	rv = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
 				  offset, nr_segs, gfs2_get_block_direct,
-				  NULL, NULL, 0);
+				  NULL, NULL, DIO_SYNC_WRITES);
 out:
 	gfs2_glock_dq_m(1, &gh);
 	gfs2_holder_uninit(&gh);
-- 
1.7.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH 4/7] btrfs: Use generic handlers of O_SYNC AIO DIO
  2012-03-02 19:56 [PATCH 0/7, v2] fs: fix up AIO+DIO+O_SYNC to actually do the sync part Jeff Moyer
                   ` (2 preceding siblings ...)
  2012-03-02 19:56 ` [PATCH 3/7] gfs2: " Jeff Moyer
@ 2012-03-02 19:56 ` Jeff Moyer
  2012-03-02 19:56 ` [PATCH 5/7] xfs: honor the O_SYNC flag for aysnchronous direct I/O requests Jeff Moyer
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 9+ messages in thread
From: Jeff Moyer @ 2012-03-02 19:56 UTC (permalink / raw)
  To: linux-fsdevel, linux-ext4, xfs; +Cc: jack, hch, Jeff Moyer

From: Jan Kara <jack@suse.cz>

Use generic handlers to queue fsync() when AIO DIO is completed for O_SYNC
file. Although we use our own bio->end_io function, we call dio_end_io()
from it and thus, because we don't set any specific dio->end_io function,
generic code ends up calling generic_dio_end_io() which is all what we need
for proper O_SYNC AIO DIO handling.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
---
 fs/btrfs/inode.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 81b235a..c0379e6 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6219,7 +6219,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 	ret = __blockdev_direct_IO(rw, iocb, inode,
 		   BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
 		   iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
-		   btrfs_submit_direct, 0);
+		   btrfs_submit_direct, DIO_SYNC_WRITES);
 
 	if (ret < 0 && ret != -EIOCBQUEUED) {
 		clear_extent_bit(&BTRFS_I(inode)->io_tree, offset,
-- 
1.7.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH 5/7] xfs: honor the O_SYNC flag for aysnchronous direct I/O requests
  2012-03-02 19:56 [PATCH 0/7, v2] fs: fix up AIO+DIO+O_SYNC to actually do the sync part Jeff Moyer
                   ` (3 preceding siblings ...)
  2012-03-02 19:56 ` [PATCH 4/7] btrfs: " Jeff Moyer
@ 2012-03-02 19:56 ` Jeff Moyer
  2012-03-02 19:56 ` [PATCH 6/7] ext4: " Jeff Moyer
  2012-03-02 19:56 ` [PATCH 7/7] filemap: don't call generic_write_sync for -EIOCBQUEUED Jeff Moyer
  6 siblings, 0 replies; 9+ messages in thread
From: Jeff Moyer @ 2012-03-02 19:56 UTC (permalink / raw)
  To: linux-fsdevel, linux-ext4, xfs; +Cc: jack, hch, Jeff Moyer

Hi,

If a file is opened with O_SYNC|O_DIRECT, the drive cache does not get
flushed after the write completion for AIOs.  This patch attempts to fix
that problem by marking an I/O as requiring a cache flush in endio
processing, and then issuing the cache flush after any unwritten extent
conversion is done.

Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
---
 fs/xfs/xfs_aops.c |  113 +++++++++++++++++++++++++++++++++++++++++++++++++++--
 fs/xfs/xfs_aops.h |    1 +
 fs/xfs/xfs_buf.c  |    9 ++++
 3 files changed, 119 insertions(+), 4 deletions(-)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 574d4ee..90bed4e 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -26,6 +26,7 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
+#include "xfs_inode_item.h"
 #include "xfs_alloc.h"
 #include "xfs_error.h"
 #include "xfs_rw.h"
@@ -158,6 +159,58 @@ xfs_setfilesize(
 }
 
 /*
+ * In the case of synchronous, AIO, O_DIRECT writes, we need to flush
+ * the disk cache when the I/O is complete.
+ */
+STATIC bool
+xfs_ioend_needs_cache_flush(
+	struct xfs_ioend	*ioend)
+{
+	struct xfs_inode *ip = XFS_I(ioend->io_inode);
+	struct xfs_mount *mp = ip->i_mount;
+
+	if (!ioend->io_isasync)
+		return false;
+
+	if (!(mp->m_flags & XFS_MOUNT_BARRIER))
+		return false;
+
+	return (IS_SYNC(ioend->io_inode) ||
+		(ioend->io_iocb->ki_filp->f_flags & O_DSYNC));
+}
+
+STATIC void
+xfs_end_io_flush(
+	struct bio	*bio,
+	int		error)
+{
+	struct xfs_ioend *ioend = bio->bi_private;
+
+	if (error && ioend->io_result > 0)
+		ioend->io_result = error;
+
+	xfs_destroy_ioend(ioend);
+	bio_put(bio);
+}
+
+/*
+ * Issue a WRITE_FLUSH to the specified device.
+ */
+STATIC void
+xfs_ioend_flush_cache(
+	struct xfs_ioend	*ioend,
+	xfs_buftarg_t		*targp)
+{
+	struct bio *bio;
+
+	bio = bio_alloc(GFP_KERNEL, 0);
+	bio->bi_end_io = xfs_end_io_flush;
+	bio->bi_bdev = targp->bt_bdev;
+	bio->bi_private = ioend;
+	submit_bio(WRITE_FLUSH, bio);
+}
+
+/*
  * Schedule IO completion handling on the final put of an ioend.
  *
  * If there is no work to do we might as well call it a day and free the
@@ -172,11 +225,61 @@ xfs_finish_ioend(
 			queue_work(xfsconvertd_workqueue, &ioend->io_work);
 		else if (xfs_ioend_is_append(ioend))
 			queue_work(xfsdatad_workqueue, &ioend->io_work);
+		else if (xfs_ioend_needs_cache_flush(ioend))
+			queue_work(xfsflushd_workqueue, &ioend->io_work);
 		else
 			xfs_destroy_ioend(ioend);
 	}
 }
 
+STATIC void
+xfs_ioend_force_cache_flush(
+	xfs_ioend_t	*ioend)
+{
+	struct xfs_inode *ip = XFS_I(ioend->io_inode);
+	struct xfs_mount *mp = ip->i_mount;
+	xfs_lsn_t	lsn = 0;
+	int		err = 0;
+	int		log_flushed = 0;
+
+	/*
+	 * Check to see if we need to sync metadata.  If so,
+	 * perform a log flush.  If not, just flush the disk
+	 * write cache for the data disk.
+	 */
+	if (IS_SYNC(ioend->io_inode) ||
+	    (ioend->io_iocb->ki_filp->f_flags & __O_SYNC)) {
+		/*
+		 * TODO: xfs_blkdev_issue_flush and _xfs_log_force_lsn
+		 * are synchronous, and so will block the I/O
+		 * completion work queue.
+		 */
+		/*
+		 * If the log device is different from the data device,
+		 * be sure to flush the cache on the data device
+		 * first.
+		 */
+		if (mp->m_logdev_targp != mp->m_ddev_targp)
+			xfs_blkdev_issue_flush(mp->m_ddev_targp);
+
+		xfs_ilock(ip, XFS_ILOCK_SHARED);
+		if (xfs_ipincount(ip))
+			lsn = ip->i_itemp->ili_last_lsn;
+		xfs_iunlock(ip, XFS_ILOCK_SHARED);
+		if (lsn)
+			err = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC,
+						 &log_flushed);
+		if (err && ioend->io_result > 0)
+			ioend->io_result = err;
+		if (err || log_flushed)
+			xfs_destroy_ioend(ioend);
+		else
+			xfs_ioend_flush_cache(ioend, mp->m_logdev_targp);
+	} else
+		/* data sync only, flush the disk cache */
+		xfs_ioend_flush_cache(ioend, mp->m_ddev_targp);
+}
+
 /*
  * IO write completion.
  */
@@ -218,17 +321,19 @@ xfs_end_io(
 done:
 	/*
 	 * If we didn't complete processing of the ioend, requeue it to the
-	 * tail of the workqueue for another attempt later. Otherwise destroy
-	 * it.
+	 * tail of the workqueue for another attempt later. Otherwise, see
+	 * if we need to perform a disk write cache flush.  If not, destroy
+	 * the ioend.
 	 */
 	if (error == EAGAIN) {
 		atomic_inc(&ioend->io_remaining);
 		xfs_finish_ioend(ioend);
 		/* ensure we don't spin on blocked ioends */
 		delay(1);
-	} else {
+	} else if (xfs_ioend_needs_cache_flush(ioend))
+		xfs_ioend_force_cache_flush(ioend);
+	else
 		xfs_destroy_ioend(ioend);
-	}
 }
 
 /*
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index 116dd5c..3f4a1c4 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -20,6 +20,7 @@
 
 extern struct workqueue_struct *xfsdatad_workqueue;
 extern struct workqueue_struct *xfsconvertd_workqueue;
+extern struct workqueue_struct *xfsflushd_workqueue;
 extern mempool_t *xfs_ioend_pool;
 
 /*
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 4dff85c..fcc20e1 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -47,6 +47,7 @@ STATIC int xfsbufd(void *);
 static struct workqueue_struct *xfslogd_workqueue;
 struct workqueue_struct *xfsdatad_workqueue;
 struct workqueue_struct *xfsconvertd_workqueue;
+struct workqueue_struct *xfsflushd_workqueue;
 
 #ifdef XFS_BUF_LOCK_TRACKING
 # define XB_SET_OWNER(bp)	((bp)->b_last_holder = current->pid)
@@ -1802,8 +1803,15 @@ xfs_buf_init(void)
 	if (!xfsconvertd_workqueue)
 		goto out_destroy_xfsdatad_workqueue;
 
+	xfsflushd_workqueue = alloc_workqueue("xfsflushd",
+					      WQ_MEM_RECLAIM, 0);
+	if (!xfsflushd_workqueue)
+		goto out_destroy_xfsconvertd_workqueue;
+
 	return 0;
 
+ out_destroy_xfsconvertd_workqueue:
+	destroy_workqueue(xfsconvertd_workqueue);
  out_destroy_xfsdatad_workqueue:
 	destroy_workqueue(xfsdatad_workqueue);
  out_destroy_xfslogd_workqueue:
@@ -1817,6 +1825,7 @@ xfs_buf_init(void)
 void
 xfs_buf_terminate(void)
 {
+	destroy_workqueue(xfsflushd_workqueue);
 	destroy_workqueue(xfsconvertd_workqueue);
 	destroy_workqueue(xfsdatad_workqueue);
 	destroy_workqueue(xfslogd_workqueue);
-- 
1.7.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH 6/7] ext4: honor the O_SYNC flag for aysnchronous direct I/O requests
  2012-03-02 19:56 [PATCH 0/7, v2] fs: fix up AIO+DIO+O_SYNC to actually do the sync part Jeff Moyer
                   ` (4 preceding siblings ...)
  2012-03-02 19:56 ` [PATCH 5/7] xfs: honor the O_SYNC flag for aysnchronous direct I/O requests Jeff Moyer
@ 2012-03-02 19:56 ` Jeff Moyer
  2012-03-05 10:09   ` Jan Kara
  2012-03-02 19:56 ` [PATCH 7/7] filemap: don't call generic_write_sync for -EIOCBQUEUED Jeff Moyer
  6 siblings, 1 reply; 9+ messages in thread
From: Jeff Moyer @ 2012-03-02 19:56 UTC (permalink / raw)
  To: linux-fsdevel, linux-ext4, xfs; +Cc: jack, hch, Jeff Moyer

Hi,

If a file is opened with O_SYNC|O_DIRECT, the drive cache does not get
flushed after the write completion.  Instead, it's flushed *before* the
I/O is sent to the disk (in __generic_file_aio_write).  This patch
attempts to fix that problem by marking an I/O as requiring a cache
flush in endio processing.  I'll send a follow-on patch to the
generic write code to get rid of the bogus generic_write_sync call
when EIOCBQUEUED is returned.

Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
---
 fs/ext4/ext4.h    |    4 +++
 fs/ext4/inode.c   |   11 ++++++-
 fs/ext4/page-io.c |   73 +++++++++++++++++++++++++++++++++++++++++++++++-----
 fs/ext4/super.c   |   11 ++++++++
 4 files changed, 90 insertions(+), 9 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 3ce6a0c..e1478be 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -186,6 +186,7 @@ struct mpage_da_data {
 #define EXT4_IO_END_QUEUED	0x0004
 #define EXT4_IO_END_DIRECT	0x0008
 #define EXT4_IO_END_IN_FSYNC	0x0010
+#define EXT4_IO_END_NEEDS_SYNC	0x0020
 
 struct ext4_io_page {
 	struct page	*p_page;
@@ -1248,6 +1249,9 @@ struct ext4_sb_info {
 	/* workqueue for dio unwritten */
 	struct workqueue_struct *dio_unwritten_wq;
 
+	/* workqueue for aio+dio+o_sync disk cache flushing */
+	struct workqueue_struct *aio_dio_flush_wq;
+
 	/* timer for periodic error stats printing */
 	struct timer_list s_err_report;
 
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index f6dc02b..13cdb4c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2769,8 +2769,12 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
 
 	iocb->private = NULL;
 
+	/* AIO+DIO+O_SYNC I/Os need a cache flush after completion */
+	if (is_async && (IS_SYNC(inode) || (iocb->ki_filp->f_flags & O_DSYNC)))
+		io_end->flag |= EXT4_IO_END_NEEDS_SYNC;
+
 	/* if not aio dio with unwritten extents, just free io and return */
-	if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
+	if (!(io_end->flag & (EXT4_IO_END_UNWRITTEN|EXT4_IO_END_NEEDS_SYNC))) {
 		ext4_free_io_end(io_end);
 out:
 		if (is_async)
@@ -2785,7 +2789,10 @@ out:
 		io_end->iocb = iocb;
 		io_end->result = ret;
 	}
-	wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
+	if (io_end->flag & EXT4_IO_END_UNWRITTEN)
+		wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
+	else
+		wq = EXT4_SB(io_end->inode->i_sb)->aio_dio_flush_wq;
 
 	/* Add the io_end to per-inode completed aio dio list*/
 	ei = EXT4_I(io_end->inode);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index dcdeef1..baa1b84 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -82,6 +82,50 @@ void ext4_free_io_end(ext4_io_end_t *io)
 }
 
 /*
+ * This function is called in the completion path for AIO O_SYNC|O_DIRECT
+ * writes, and also in the fsync path.  The purpose is to ensure that the
+ * disk caches for the journal and data devices are flushed.
+ */
+static int ext4_end_io_do_flush(ext4_io_end_t *io)
+{
+	struct inode *inode = io->inode;
+	tid_t commit_tid;
+	bool needs_barrier = false;
+	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
+	int barriers_enabled = test_opt(inode->i_sb, BARRIER);
+	int ret = 0;
+
+	if (!barriers_enabled)
+		return 0;
+
+	/*
+	 * If we are running in nojournal mode, just flush the disk
+	 * cache and return.
+	 */
+	if (!journal)
+		return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_NOIO, NULL);
+
+	if (ext4_should_journal_data(inode)) {
+		ret = ext4_force_commit(inode->i_sb);
+		goto out;
+	}
+
+	commit_tid = io->iocb->ki_filp->f_flags & __O_SYNC ?
+		EXT4_I(inode)->i_sync_tid : EXT4_I(inode)->i_datasync_tid;
+	if (!jbd2_trans_will_send_data_barrier(journal, commit_tid))
+		needs_barrier = true;
+
+	jbd2_log_start_commit(journal, commit_tid);
+	ret = jbd2_log_wait_commit(journal, commit_tid);
+
+	if (!ret && needs_barrier)
+		ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_NOIO, NULL);
+
+out:
+	return ret;
+}
+
+/*
  * check a range of space and convert unwritten extents to written.
  *
  * Called with inode->i_mutex; we depend on this when we manipulate
@@ -98,15 +142,30 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
 		   "list->prev 0x%p\n",
 		   io, inode->i_ino, io->list.next, io->list.prev);
 
-	ret = ext4_convert_unwritten_extents(inode, offset, size);
-	if (ret < 0) {
-		ext4_msg(inode->i_sb, KERN_EMERG,
-			 "failed to convert unwritten extents to written "
-			 "extents -- potential data loss!  "
-			 "(inode %lu, offset %llu, size %zd, error %d)",
-			 inode->i_ino, offset, size, ret);
+	if (io->flag & EXT4_IO_END_UNWRITTEN) {
+
+		ret = ext4_convert_unwritten_extents(inode, offset, size);
+		if (ret < 0) {
+			ext4_msg(inode->i_sb, KERN_EMERG,
+				 "failed to convert unwritten extents to "
+				 "written extents -- potential data loss!  "
+				 "(inode %lu, offset %llu, size %zd, error %d)",
+				 inode->i_ino, offset, size, ret);
+			goto endio;
+		}
 	}
 
+	/*
+	 * This function has two callers.  The first is the end_io_work
+	 * routine just below, which is an asynchronous completion context.
+	 * The second is in the fsync path.  For the latter path, we can't
+	 * return from here until the job is done.  Hence,
+	 * ext4_end_io_do_flush is blocking.
+	 */
+	if (io->flag & EXT4_IO_END_NEEDS_SYNC)
+		ret = ext4_end_io_do_flush(io);
+
+endio:
 	if (io->iocb)
 		aio_complete(io->iocb, io->result, 0);
 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 502c61f..a24938e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -805,6 +805,7 @@ static void ext4_put_super(struct super_block *sb)
 	dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
 
 	flush_workqueue(sbi->dio_unwritten_wq);
+	destroy_workqueue(sbi->aio_dio_flush_wq);
 	destroy_workqueue(sbi->dio_unwritten_wq);
 
 	lock_super(sb);
@@ -3718,6 +3719,13 @@ no_journal:
 		goto failed_mount_wq;
 	}
 
+	EXT4_SB(sb)->aio_dio_flush_wq =
+		alloc_workqueue("ext4-aio-dio-flush", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
+	if (!EXT4_SB(sb)->aio_dio_flush_wq) {
+		printk(KERN_ERR "EXT4-fs: failed to create flush workqueue\n");
+		goto failed_flush_wq;
+	}
+
 	/*
 	 * The jbd2_journal_load will have done any necessary log recovery,
 	 * so we can safely mount the rest of the filesystem now.
@@ -3840,6 +3848,8 @@ failed_mount4a:
 	sb->s_root = NULL;
 failed_mount4:
 	ext4_msg(sb, KERN_ERR, "mount failed");
+	destroy_workqueue(EXT4_SB(sb)->aio_dio_flush_wq);
+failed_flush_wq:
 	destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
 failed_mount_wq:
 	if (sbi->s_journal) {
@@ -4303,6 +4313,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
 
 	trace_ext4_sync_fs(sb, wait);
 	flush_workqueue(sbi->dio_unwritten_wq);
+	flush_workqueue(sbi->aio_dio_flush_wq);
 	if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
 		if (wait)
 			jbd2_log_wait_commit(sbi->s_journal, target);
-- 
1.7.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH 7/7] filemap: don't call generic_write_sync for -EIOCBQUEUED
  2012-03-02 19:56 [PATCH 0/7, v2] fs: fix up AIO+DIO+O_SYNC to actually do the sync part Jeff Moyer
                   ` (5 preceding siblings ...)
  2012-03-02 19:56 ` [PATCH 6/7] ext4: " Jeff Moyer
@ 2012-03-02 19:56 ` Jeff Moyer
  6 siblings, 0 replies; 9+ messages in thread
From: Jeff Moyer @ 2012-03-02 19:56 UTC (permalink / raw)
  To: linux-fsdevel, linux-ext4, xfs; +Cc: jack, hch, Jeff Moyer

Hi,

As it stands, generic_file_aio_write will call into generic_write_sync
when -EIOCBQUEUED is returned from __generic_file_aio_write.  EIOCBQUEUED
indicates that an I/O was submitted but NOT completed.  Thus, we will
flush the disk cache, potentially before the write(s) even make it to
the disk!  Up until now, this has been the best we could do, as file
systems didn't bother to flush the disk cache after an O_SYNC AIO+DIO
write.  After applying the prior two patches to xfs and ext4, at least
the major two file systems do the right thing.  So, let's go ahead and
fix this backwards logic.

Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
---
 mm/filemap.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index c4ee2e9..004442f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2634,7 +2634,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
 	mutex_unlock(&inode->i_mutex);

-	if (ret > 0 || ret == -EIOCBQUEUED) {
+	if (ret > 0) {
 		ssize_t err;

 		err = generic_write_sync(file, pos, ret);
-- 
1.7.1

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH 6/7] ext4: honor the O_SYNC flag for aysnchronous direct I/O requests
  2012-03-02 19:56 ` [PATCH 6/7] ext4: " Jeff Moyer
@ 2012-03-05 10:09   ` Jan Kara
  0 siblings, 0 replies; 9+ messages in thread
From: Jan Kara @ 2012-03-05 10:09 UTC (permalink / raw)
  To: Jeff Moyer; +Cc: linux-fsdevel, linux-ext4, xfs, jack, hch

On Fri 02-03-12 14:56:14, Jeff Moyer wrote:
> Hi,
> 
> If a file is opened with O_SYNC|O_DIRECT, the drive cache does not get
> flushed after the write completion.  Instead, it's flushed *before* the
> I/O is sent to the disk (in __generic_file_aio_write).  This patch
> attempts to fix that problem by marking an I/O as requiring a cache
> flush in endio processing.  I'll send a follow-on patch to the
> generic write code to get rid of the bogus generic_write_sync call
> when EIOCBQUEUED is returned.
  Looks good. You can add:
Reviewed-by: Jan Kara <jack@suse.cz>

								Honza
> 
> Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
> ---
>  fs/ext4/ext4.h    |    4 +++
>  fs/ext4/inode.c   |   11 ++++++-
>  fs/ext4/page-io.c |   73 +++++++++++++++++++++++++++++++++++++++++++++++-----
>  fs/ext4/super.c   |   11 ++++++++
>  4 files changed, 90 insertions(+), 9 deletions(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 3ce6a0c..e1478be 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -186,6 +186,7 @@ struct mpage_da_data {
>  #define EXT4_IO_END_QUEUED	0x0004
>  #define EXT4_IO_END_DIRECT	0x0008
>  #define EXT4_IO_END_IN_FSYNC	0x0010
> +#define EXT4_IO_END_NEEDS_SYNC	0x0020
>  
>  struct ext4_io_page {
>  	struct page	*p_page;
> @@ -1248,6 +1249,9 @@ struct ext4_sb_info {
>  	/* workqueue for dio unwritten */
>  	struct workqueue_struct *dio_unwritten_wq;
>  
> +	/* workqueue for aio+dio+o_sync disk cache flushing */
> +	struct workqueue_struct *aio_dio_flush_wq;
> +
>  	/* timer for periodic error stats printing */
>  	struct timer_list s_err_report;
>  
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index f6dc02b..13cdb4c 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -2769,8 +2769,12 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
>  
>  	iocb->private = NULL;
>  
> +	/* AIO+DIO+O_SYNC I/Os need a cache flush after completion */
> +	if (is_async && (IS_SYNC(inode) || (iocb->ki_filp->f_flags & O_DSYNC)))
> +		io_end->flag |= EXT4_IO_END_NEEDS_SYNC;
> +
>  	/* if not aio dio with unwritten extents, just free io and return */
> -	if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
> +	if (!(io_end->flag & (EXT4_IO_END_UNWRITTEN|EXT4_IO_END_NEEDS_SYNC))) {
>  		ext4_free_io_end(io_end);
>  out:
>  		if (is_async)
> @@ -2785,7 +2789,10 @@ out:
>  		io_end->iocb = iocb;
>  		io_end->result = ret;
>  	}
> -	wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
> +	if (io_end->flag & EXT4_IO_END_UNWRITTEN)
> +		wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
> +	else
> +		wq = EXT4_SB(io_end->inode->i_sb)->aio_dio_flush_wq;
>  
>  	/* Add the io_end to per-inode completed aio dio list*/
>  	ei = EXT4_I(io_end->inode);
> diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
> index dcdeef1..baa1b84 100644
> --- a/fs/ext4/page-io.c
> +++ b/fs/ext4/page-io.c
> @@ -82,6 +82,50 @@ void ext4_free_io_end(ext4_io_end_t *io)
>  }
>  
>  /*
> + * This function is called in the completion path for AIO O_SYNC|O_DIRECT
> + * writes, and also in the fsync path.  The purpose is to ensure that the
> + * disk caches for the journal and data devices are flushed.
> + */
> +static int ext4_end_io_do_flush(ext4_io_end_t *io)
> +{
> +	struct inode *inode = io->inode;
> +	tid_t commit_tid;
> +	bool needs_barrier = false;
> +	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
> +	int barriers_enabled = test_opt(inode->i_sb, BARRIER);
> +	int ret = 0;
> +
> +	if (!barriers_enabled)
> +		return 0;
> +
> +	/*
> +	 * If we are running in nojournal mode, just flush the disk
> +	 * cache and return.
> +	 */
> +	if (!journal)
> +		return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_NOIO, NULL);
> +
> +	if (ext4_should_journal_data(inode)) {
> +		ret = ext4_force_commit(inode->i_sb);
> +		goto out;
> +	}
> +
> +	commit_tid = io->iocb->ki_filp->f_flags & __O_SYNC ?
> +		EXT4_I(inode)->i_sync_tid : EXT4_I(inode)->i_datasync_tid;
> +	if (!jbd2_trans_will_send_data_barrier(journal, commit_tid))
> +		needs_barrier = true;
> +
> +	jbd2_log_start_commit(journal, commit_tid);
> +	ret = jbd2_log_wait_commit(journal, commit_tid);
> +
> +	if (!ret && needs_barrier)
> +		ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_NOIO, NULL);
> +
> +out:
> +	return ret;
> +}
> +
> +/*
>   * check a range of space and convert unwritten extents to written.
>   *
>   * Called with inode->i_mutex; we depend on this when we manipulate
> @@ -98,15 +142,30 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
>  		   "list->prev 0x%p\n",
>  		   io, inode->i_ino, io->list.next, io->list.prev);
>  
> -	ret = ext4_convert_unwritten_extents(inode, offset, size);
> -	if (ret < 0) {
> -		ext4_msg(inode->i_sb, KERN_EMERG,
> -			 "failed to convert unwritten extents to written "
> -			 "extents -- potential data loss!  "
> -			 "(inode %lu, offset %llu, size %zd, error %d)",
> -			 inode->i_ino, offset, size, ret);
> +	if (io->flag & EXT4_IO_END_UNWRITTEN) {
> +
> +		ret = ext4_convert_unwritten_extents(inode, offset, size);
> +		if (ret < 0) {
> +			ext4_msg(inode->i_sb, KERN_EMERG,
> +				 "failed to convert unwritten extents to "
> +				 "written extents -- potential data loss!  "
> +				 "(inode %lu, offset %llu, size %zd, error %d)",
> +				 inode->i_ino, offset, size, ret);
> +			goto endio;
> +		}
>  	}
>  
> +	/*
> +	 * This function has two callers.  The first is the end_io_work
> +	 * routine just below, which is an asynchronous completion context.
> +	 * The second is in the fsync path.  For the latter path, we can't
> +	 * return from here until the job is done.  Hence,
> +	 * ext4_end_io_do_flush is blocking.
> +	 */
> +	if (io->flag & EXT4_IO_END_NEEDS_SYNC)
> +		ret = ext4_end_io_do_flush(io);
> +
> +endio:
>  	if (io->iocb)
>  		aio_complete(io->iocb, io->result, 0);
>  
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index 502c61f..a24938e 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -805,6 +805,7 @@ static void ext4_put_super(struct super_block *sb)
>  	dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
>  
>  	flush_workqueue(sbi->dio_unwritten_wq);
> +	destroy_workqueue(sbi->aio_dio_flush_wq);
>  	destroy_workqueue(sbi->dio_unwritten_wq);
>  
>  	lock_super(sb);
> @@ -3718,6 +3719,13 @@ no_journal:
>  		goto failed_mount_wq;
>  	}
>  
> +	EXT4_SB(sb)->aio_dio_flush_wq =
> +		alloc_workqueue("ext4-aio-dio-flush", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
> +	if (!EXT4_SB(sb)->aio_dio_flush_wq) {
> +		printk(KERN_ERR "EXT4-fs: failed to create flush workqueue\n");
> +		goto failed_flush_wq;
> +	}
> +
>  	/*
>  	 * The jbd2_journal_load will have done any necessary log recovery,
>  	 * so we can safely mount the rest of the filesystem now.
> @@ -3840,6 +3848,8 @@ failed_mount4a:
>  	sb->s_root = NULL;
>  failed_mount4:
>  	ext4_msg(sb, KERN_ERR, "mount failed");
> +	destroy_workqueue(EXT4_SB(sb)->aio_dio_flush_wq);
> +failed_flush_wq:
>  	destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
>  failed_mount_wq:
>  	if (sbi->s_journal) {
> @@ -4303,6 +4313,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
>  
>  	trace_ext4_sync_fs(sb, wait);
>  	flush_workqueue(sbi->dio_unwritten_wq);
> +	flush_workqueue(sbi->aio_dio_flush_wq);
>  	if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
>  		if (wait)
>  			jbd2_log_wait_commit(sbi->s_journal, target);
> -- 
> 1.7.1
> 
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2012-03-05 10:09 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2012-03-02 19:56 [PATCH 0/7, v2] fs: fix up AIO+DIO+O_SYNC to actually do the sync part Jeff Moyer
2012-03-02 19:56 ` [PATCH 1/7] vfs: Handle O_SYNC AIO DIO in generic code properly Jeff Moyer
2012-03-02 19:56 ` [PATCH 2/7] ocfs2: Use generic handlers of O_SYNC AIO DIO Jeff Moyer
2012-03-02 19:56 ` [PATCH 3/7] gfs2: " Jeff Moyer
2012-03-02 19:56 ` [PATCH 4/7] btrfs: " Jeff Moyer
2012-03-02 19:56 ` [PATCH 5/7] xfs: honor the O_SYNC flag for aysnchronous direct I/O requests Jeff Moyer
2012-03-02 19:56 ` [PATCH 6/7] ext4: " Jeff Moyer
2012-03-05 10:09   ` Jan Kara
2012-03-02 19:56 ` [PATCH 7/7] filemap: don't call generic_write_sync for -EIOCBQUEUED Jeff Moyer

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).