linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [patch|rfc][0/3] fix aio+dio+O_SYNC writes
@ 2012-01-27 21:15 Jeff Moyer
  2012-01-27 21:15 ` [PATCH 1/3] xfs: honor the O_SYNC flag for aysnchronous direct I/O requests Jeff Moyer
                   ` (2 more replies)
  0 siblings, 3 replies; 17+ messages in thread
From: Jeff Moyer @ 2012-01-27 21:15 UTC (permalink / raw)
  To: linux-ext4, xfs, linux-fsdevel

Currently, if an AIO/DIO/O_SYNC write is issued, generic_write_sync is
called after the submission of the I/O instead of after the completion.
This can result in flushing the disk cache before the actual write even
gets there!  This patch set attempts to fix that for xfs and ext4.
Comments would be greatly appreciated.

Cheers,
Jeff

[PATCH 1/3] xfs: honor the O_SYNC flag for aysnchronous direct I/O requests
[PATCH 2/3] ext4: honor the O_SYNC flag for aysnchronous direct I/O requests
[PATCH 3/3] filemap: don't call generic_write_sync for -EIOCBQUEUED

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH 1/3] xfs: honor the O_SYNC flag for aysnchronous direct I/O requests
  2012-01-27 21:15 [patch|rfc][0/3] fix aio+dio+O_SYNC writes Jeff Moyer
@ 2012-01-27 21:15 ` Jeff Moyer
  2012-01-28 14:59   ` Christoph Hellwig
  2012-01-27 21:15 ` [PATCH 2/3] ext4: " Jeff Moyer
  2012-01-27 21:15 ` [PATCH 3/3] filemap: don't call generic_write_sync for -EIOCBQUEUED Jeff Moyer
  2 siblings, 1 reply; 17+ messages in thread
From: Jeff Moyer @ 2012-01-27 21:15 UTC (permalink / raw)
  To: linux-ext4, xfs, linux-fsdevel; +Cc: Jeff Moyer

Hi,

If a file is opened with O_SYNC|O_DIRECT, the drive cache does not get
flushed after the write completion.  Instead, it's flushed *before* the
I/O is sent to the disk (in __generic_file_aio_write).  This patch
attempts to fix that problem by marking an I/O as requiring a cache
flush in endio processing.  I'll send a follow-on patch to the
generic write code to get rid of the bogus generic_write_sync call
when EIOCBQUEUED is returned.

Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
---
 fs/xfs/xfs_aops.c |   69 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/xfs/xfs_aops.h |    1 +
 fs/xfs/xfs_buf.c  |    9 +++++++
 3 files changed, 77 insertions(+), 2 deletions(-)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 574d4ee..909e020 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -158,6 +158,48 @@ xfs_setfilesize(
 }
 
 /*
+ * In the case of synchronous, AIO, O_DIRECT writes, we need to flush
+ * the disk cache when the I/O is complete.
+ */
+STATIC bool
+xfs_ioend_needs_cache_flush(
+	struct xfs_ioend	*ioend)
+{
+	if (!ioend->io_isasync)
+		return false;
+
+	return (IS_SYNC(ioend->io_inode) ||
+		(ioend->io_iocb->ki_filp->f_flags & O_DSYNC));
+}
+
+STATIC void
+xfs_end_io_flush(
+	struct bio	*bio,
+	int		error)
+{
+	struct xfs_ioend *ioend = bio->bi_private;
+
+	if (error && ioend->io_result > 0)
+		ioend->io_result = error;
+
+	xfs_destroy_ioend(ioend);
+	bio_put(bio);
+}
+
+STATIC void
+xfs_ioend_flush_cache(
+	struct xfs_ioend	*ioend)
+{
+	struct bio *bio;
+
+	bio = bio_alloc(GFP_KERNEL, 0);
+	bio->bi_end_io = xfs_end_io_flush;
+	bio->bi_bdev = xfs_find_bdev_for_inode(ioend->io_inode);
+	bio->bi_private = ioend;
+	submit_bio(WRITE_FLUSH, bio);
+}
+
+/*
  * Schedule IO completion handling on the final put of an ioend.
  *
  * If there is no work to do we might as well call it a day and free the
@@ -172,6 +214,8 @@ xfs_finish_ioend(
 			queue_work(xfsconvertd_workqueue, &ioend->io_work);
 		else if (xfs_ioend_is_append(ioend))
 			queue_work(xfsdatad_workqueue, &ioend->io_work);
+		else if (xfs_ioend_needs_cache_flush(ioend))
+			queue_work(xfsflushd_workqueue, &ioend->io_work);
 		else
 			xfs_destroy_ioend(ioend);
 	}
@@ -226,9 +270,30 @@ done:
 		xfs_finish_ioend(ioend);
 		/* ensure we don't spin on blocked ioends */
 		delay(1);
-	} else {
+	} else if (xfs_ioend_needs_cache_flush(ioend)) {
+		struct xfs_inode *ip = XFS_I(ioend->io_inode);
+		struct xfs_mount *mp = ip->i_mount;
+		int	err;
+		int	log_flushed = 0;
+
+		/*
+		 * Check to see if we only need to sync data.  If so,
+		 * we can skip the log flush.
+		 */
+		if (IS_SYNC(ioend->io_inode) ||
+		    (ioend->io_iocb->ki_filp->f_flags & __O_SYNC)) {
+			err = _xfs_log_force(mp, XFS_LOG_SYNC, &log_flushed);
+			if (err && ioend->io_result > 0)
+				ioend->io_result = err;
+			if (err || log_flushed) {
+				xfs_destroy_ioend(ioend);
+				return;
+			}
+		}
+		/* log not flushed or data sync only, flush the disk cache */
+		xfs_ioend_flush_cache(ioend);
+	} else
 		xfs_destroy_ioend(ioend);
-	}
 }
 
 /*
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index 116dd5c..3f4a1c4 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -20,6 +20,7 @@
 
 extern struct workqueue_struct *xfsdatad_workqueue;
 extern struct workqueue_struct *xfsconvertd_workqueue;
+extern struct workqueue_struct *xfsflushd_workqueue;
 extern mempool_t *xfs_ioend_pool;
 
 /*
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 4dff85c..39980a8 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -47,6 +47,7 @@ STATIC int xfsbufd(void *);
 static struct workqueue_struct *xfslogd_workqueue;
 struct workqueue_struct *xfsdatad_workqueue;
 struct workqueue_struct *xfsconvertd_workqueue;
+struct workqueue_struct *xfsflushd_workqueue;
 
 #ifdef XFS_BUF_LOCK_TRACKING
 # define XB_SET_OWNER(bp)	((bp)->b_last_holder = current->pid)
@@ -1802,8 +1803,15 @@ xfs_buf_init(void)
 	if (!xfsconvertd_workqueue)
 		goto out_destroy_xfsdatad_workqueue;
 
+	xfsflushd_workqueue = alloc_workqueue("xfsflushd",
+					      WQ_MEM_RECLAIM, 1);
+	if (!xfsflushd_workqueue)
+		goto out_destroy_xfsconvertd_workqueue;
+
 	return 0;
 
+ out_destroy_xfsconvertd_workqueue:
+	destroy_workqueue(xfsconvertd_workqueue);
  out_destroy_xfsdatad_workqueue:
 	destroy_workqueue(xfsdatad_workqueue);
  out_destroy_xfslogd_workqueue:
@@ -1817,6 +1825,7 @@ xfs_buf_init(void)
 void
 xfs_buf_terminate(void)
 {
+	destroy_workqueue(xfsflushd_workqueue);
 	destroy_workqueue(xfsconvertd_workqueue);
 	destroy_workqueue(xfsdatad_workqueue);
 	destroy_workqueue(xfslogd_workqueue);
-- 
1.7.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 2/3] ext4: honor the O_SYNC flag for aysnchronous direct I/O requests
  2012-01-27 21:15 [patch|rfc][0/3] fix aio+dio+O_SYNC writes Jeff Moyer
  2012-01-27 21:15 ` [PATCH 1/3] xfs: honor the O_SYNC flag for aysnchronous direct I/O requests Jeff Moyer
@ 2012-01-27 21:15 ` Jeff Moyer
  2012-02-02 17:31   ` Jan Kara
  2012-01-27 21:15 ` [PATCH 3/3] filemap: don't call generic_write_sync for -EIOCBQUEUED Jeff Moyer
  2 siblings, 1 reply; 17+ messages in thread
From: Jeff Moyer @ 2012-01-27 21:15 UTC (permalink / raw)
  To: linux-ext4, xfs, linux-fsdevel; +Cc: Jeff Moyer

Hi,

If a file is opened with O_SYNC|O_DIRECT, the drive cache does not get
flushed after the write completion.  Instead, it's flushed *before* the
I/O is sent to the disk (in __generic_file_aio_write).  This patch
attempts to fix that problem by marking an I/O as requiring a cache
flush in endio processing.  I'll send a follow-on patch to the
generic write code to get rid of the bogus generic_write_sync call
when EIOCBQUEUED is returned.

Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
---
 fs/ext4/ext4.h    |    4 ++++
 fs/ext4/inode.c   |   11 +++++++++--
 fs/ext4/page-io.c |   39 ++++++++++++++++++++++++++++++++-------
 fs/ext4/super.c   |   11 +++++++++++
 4 files changed, 56 insertions(+), 9 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 2d55d7c..4377ed3 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -185,6 +185,7 @@ struct mpage_da_data {
 #define EXT4_IO_END_ERROR	0x0002
 #define EXT4_IO_END_QUEUED	0x0004
 #define EXT4_IO_END_DIRECT	0x0008
+#define EXT4_IO_END_NEEDS_SYNC	0x0010
 
 struct ext4_io_page {
 	struct page	*p_page;
@@ -1247,6 +1248,9 @@ struct ext4_sb_info {
 	/* workqueue for dio unwritten */
 	struct workqueue_struct *dio_unwritten_wq;
 
+	/* workqueue for aio+dio+o_sync disk cache flushing */
+	struct workqueue_struct *aio_dio_flush_wq;
+
 	/* timer for periodic error stats printing */
 	struct timer_list s_err_report;
 
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index f6dc02b..13cdb4c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2769,8 +2769,12 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
 
 	iocb->private = NULL;
 
+	/* AIO+DIO+O_SYNC I/Os need a cache flush after completion */
+	if (is_async && (IS_SYNC(inode) || (iocb->ki_filp->f_flags & O_DSYNC)))
+		io_end->flag |= EXT4_IO_END_NEEDS_SYNC;
+
 	/* if not aio dio with unwritten extents, just free io and return */
-	if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
+	if (!(io_end->flag & (EXT4_IO_END_UNWRITTEN|EXT4_IO_END_NEEDS_SYNC))) {
 		ext4_free_io_end(io_end);
 out:
 		if (is_async)
@@ -2785,7 +2789,10 @@ out:
 		io_end->iocb = iocb;
 		io_end->result = ret;
 	}
-	wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
+	if (io_end->flag & EXT4_IO_END_UNWRITTEN)
+		wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
+	else
+		wq = EXT4_SB(io_end->inode->i_sb)->aio_dio_flush_wq;
 
 	/* Add the io_end to per-inode completed aio dio list*/
 	ei = EXT4_I(io_end->inode);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 9e1b8eb..d07cd40 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -98,15 +98,40 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
 		   "list->prev 0x%p\n",
 		   io, inode->i_ino, io->list.next, io->list.prev);
 
-	ret = ext4_convert_unwritten_extents(inode, offset, size);
-	if (ret < 0) {
-		ext4_msg(inode->i_sb, KERN_EMERG,
-			 "failed to convert unwritten extents to written "
-			 "extents -- potential data loss!  "
-			 "(inode %lu, offset %llu, size %zd, error %d)",
-			 inode->i_ino, offset, size, ret);
+	if (io->flag & EXT4_IO_END_UNWRITTEN) {
+
+		ret = ext4_convert_unwritten_extents(inode, offset, size);
+		if (ret < 0) {
+			ext4_msg(inode->i_sb, KERN_EMERG,
+				 "failed to convert unwritten extents to "
+				 "written extents -- potential data loss!  "
+				 "(inode %lu, offset %llu, size %zd, error %d)",
+				 inode->i_ino, offset, size, ret);
+			goto endio;
+		}
+	}
+
+	/*
+	 * This function has two callers.  The first is the end_io_work
+	 * routine just below.  This is an asynchronous completion context.
+	 * The second is in the fsync path.  For the latter path, we can't
+	 * return from here until the job is done.  Hence, we issue a
+	 * blocking blkdev_issue_flush call.
+	 */
+	if (io->flag & EXT4_IO_END_NEEDS_SYNC) {
+		/*
+		 * Ideally, we'd like to know if the force_commit routine
+		 * actually did send something to disk.  If it didn't,
+		 * then we need to issue the cache flush by hand.  For now,
+		 * play it safe and do both.
+		 */
+		ret = ext4_force_commit(inode->i_sb);
+		if (ret)
+			goto endio;
+		ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_NOIO, NULL);
 	}
 
+endio:
 	if (io->iocb)
 		aio_complete(io->iocb, io->result, 0);
 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 502c61f..a24938e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -805,6 +805,7 @@ static void ext4_put_super(struct super_block *sb)
 	dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
 
 	flush_workqueue(sbi->dio_unwritten_wq);
+	destroy_workqueue(sbi->aio_dio_flush_wq);
 	destroy_workqueue(sbi->dio_unwritten_wq);
 
 	lock_super(sb);
@@ -3718,6 +3719,13 @@ no_journal:
 		goto failed_mount_wq;
 	}
 
+	EXT4_SB(sb)->aio_dio_flush_wq =
+		alloc_workqueue("ext4-aio-dio-flush", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
+	if (!EXT4_SB(sb)->aio_dio_flush_wq) {
+		printk(KERN_ERR "EXT4-fs: failed to create flush workqueue\n");
+		goto failed_flush_wq;
+	}
+
 	/*
 	 * The jbd2_journal_load will have done any necessary log recovery,
 	 * so we can safely mount the rest of the filesystem now.
@@ -3840,6 +3848,8 @@ failed_mount4a:
 	sb->s_root = NULL;
 failed_mount4:
 	ext4_msg(sb, KERN_ERR, "mount failed");
+	destroy_workqueue(EXT4_SB(sb)->aio_dio_flush_wq);
+failed_flush_wq:
 	destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
 failed_mount_wq:
 	if (sbi->s_journal) {
@@ -4303,6 +4313,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
 
 	trace_ext4_sync_fs(sb, wait);
 	flush_workqueue(sbi->dio_unwritten_wq);
+	flush_workqueue(sbi->aio_dio_flush_wq);
 	if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
 		if (wait)
 			jbd2_log_wait_commit(sbi->s_journal, target);
-- 
1.7.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 3/3] filemap: don't call generic_write_sync for -EIOCBQUEUED
  2012-01-27 21:15 [patch|rfc][0/3] fix aio+dio+O_SYNC writes Jeff Moyer
  2012-01-27 21:15 ` [PATCH 1/3] xfs: honor the O_SYNC flag for aysnchronous direct I/O requests Jeff Moyer
  2012-01-27 21:15 ` [PATCH 2/3] ext4: " Jeff Moyer
@ 2012-01-27 21:15 ` Jeff Moyer
  2012-01-28 15:08   ` Martin Steigerwald
  2012-02-02 17:52   ` Jan Kara
  2 siblings, 2 replies; 17+ messages in thread
From: Jeff Moyer @ 2012-01-27 21:15 UTC (permalink / raw)
  To: linux-ext4, xfs, linux-fsdevel; +Cc: Jeff Moyer

Hi,

As it stands, generic_file_aio_write will call into generic_write_sync
when -EIOCBQUEUED is returned from __generic_file_aio_write.  EIOCBQUEUED
indicates that an I/O was submitted but NOT completed.  Thus, we will
flush the disk cache, potentially before the write(s) even make it to
the disk!  Up until now, this has been the best we could do, as file
systems didn't bother to flush the disk cache after an O_SYNC AIO+DIO
write.  After applying the prior two patches to xfs and ext4, at least
the major two file systems do the right thing.  So, let's go ahead and
fix this backwards logic.

Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
---
 mm/filemap.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index c4ee2e9..004442f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2634,7 +2634,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
 	mutex_unlock(&inode->i_mutex);
 
-	if (ret > 0 || ret == -EIOCBQUEUED) {
+	if (ret > 0) {
 		ssize_t err;
 
 		err = generic_write_sync(file, pos, ret);
-- 
1.7.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH 1/3] xfs: honor the O_SYNC flag for aysnchronous direct I/O requests
  2012-01-27 21:15 ` [PATCH 1/3] xfs: honor the O_SYNC flag for aysnchronous direct I/O requests Jeff Moyer
@ 2012-01-28 14:59   ` Christoph Hellwig
  0 siblings, 0 replies; 17+ messages in thread
From: Christoph Hellwig @ 2012-01-28 14:59 UTC (permalink / raw)
  To: Jeff Moyer; +Cc: linux-fsdevel, linux-ext4, xfs

This looks pretty good.  Did this past xfstests?  I'd also like to add
tests actually executing this code path just, to be sure.  E.g. variants
of aio-stress actually using O_SYNC.  We can't easily test data really
made it to disk that way, although at least we make sure the code
doesn't break.

On Fri, Jan 27, 2012 at 04:15:47PM -0500, Jeff Moyer wrote:
> Hi,
> 
> If a file is opened with O_SYNC|O_DIRECT, the drive cache does not get
> flushed after the write completion.  Instead, it's flushed *before* the
> I/O is sent to the disk (in __generic_file_aio_write).

XFS doesn't actually use __generic_file_aio_write, so this sentence
isn't correct for XFS.

> +	} else if (xfs_ioend_needs_cache_flush(ioend)) {
> +		struct xfs_inode *ip = XFS_I(ioend->io_inode);
> +		struct xfs_mount *mp = ip->i_mount;
> +		int	err;
> +		int	log_flushed = 0;
> +
> +		/*
> +		 * Check to see if we only need to sync data.  If so,
> +		 * we can skip the log flush.
> +		 */
> +		if (IS_SYNC(ioend->io_inode) ||
> +		    (ioend->io_iocb->ki_filp->f_flags & __O_SYNC)) {

> +			err = _xfs_log_force(mp, XFS_LOG_SYNC, &log_flushed);

Can you add a TODO comment that this actually is synchronous and thus
will block the I/O completion work queue?

Also you can use _xfs_log_force_lsn here as don't need to flush the
whole log, just up to the last lsn that touched the inode.  Copy, or
better factor the code from xfs_dir_fsync for that.

Last but not least this won't catch timestamp updates.  Given that I'm
about to send a series making timestamp updates transaction I would not
recommend you to bother with that, but if you want to take a look
at how xfs_file_fsync deals with them.  Given that this series touches
the same area I'd also like to take your xfs patch in through the xfs tree
to avoid conflicts.

> @@ -47,6 +47,7 @@ STATIC int xfsbufd(void *);
>  static struct workqueue_struct *xfslogd_workqueue;
>  struct workqueue_struct *xfsdatad_workqueue;
>  struct workqueue_struct *xfsconvertd_workqueue;
> +struct workqueue_struct *xfsflushd_workqueue;
>  
>  #ifdef XFS_BUF_LOCK_TRACKING
>  # define XB_SET_OWNER(bp)	((bp)->b_last_holder = current->pid)
> @@ -1802,8 +1803,15 @@ xfs_buf_init(void)
>  	if (!xfsconvertd_workqueue)
>  		goto out_destroy_xfsdatad_workqueue;
>  
> +	xfsflushd_workqueue = alloc_workqueue("xfsflushd",
> +					      WQ_MEM_RECLAIM, 1);

This should allow a higher concurrently level, it's probably a good
idea to pass 0 and use the default.

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 3/3] filemap: don't call generic_write_sync for -EIOCBQUEUED
  2012-01-27 21:15 ` [PATCH 3/3] filemap: don't call generic_write_sync for -EIOCBQUEUED Jeff Moyer
@ 2012-01-28 15:08   ` Martin Steigerwald
  2012-02-02 17:52   ` Jan Kara
  1 sibling, 0 replies; 17+ messages in thread
From: Martin Steigerwald @ 2012-01-28 15:08 UTC (permalink / raw)
  To: xfs; +Cc: linux-fsdevel, Jeff Moyer, linux-ext4, linux-btrfs

Adding linux-btrfs to Cc.

Am Freitag, 27. Januar 2012 schrieb Jeff Moyer:
> Hi,

Hi,
 
> As it stands, generic_file_aio_write will call into generic_write_sync
> when -EIOCBQUEUED is returned from __generic_file_aio_write. 
> EIOCBQUEUED indicates that an I/O was submitted but NOT completed. 
> Thus, we will flush the disk cache, potentially before the write(s)
> even make it to the disk!  Up until now, this has been the best we
> could do, as file systems didn't bother to flush the disk cache after
> an O_SYNC AIO+DIO write.  After applying the prior two patches to xfs
> and ext4, at least the major two file systems do the right thing.  So,
> let's go ahead and fix this backwards logic.

Would this need an adaption to BTRFS as well?

Thanks,
Martin

> Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
> ---
>  mm/filemap.c |    2 +-
>  1 files changed, 1 insertions(+), 1 deletions(-)
> 
> diff --git a/mm/filemap.c b/mm/filemap.c
> index c4ee2e9..004442f 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -2634,7 +2634,7 @@ ssize_t generic_file_aio_write(struct kiocb
> *iocb, const struct iovec *iov, ret = __generic_file_aio_write(iocb,
> iov, nr_segs, &iocb->ki_pos); mutex_unlock(&inode->i_mutex);
> 
> -	if (ret > 0 || ret == -EIOCBQUEUED) {
> +	if (ret > 0) {
>  		ssize_t err;
> 
>  		err = generic_write_sync(file, pos, ret);


-- 
Martin 'Helios' Steigerwald - http://www.Lichtvoll.de
GPG: 03B0 0D6C 0040 0710 4AFA  B82F 991B EAAC A599 84C7

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/3] ext4: honor the O_SYNC flag for aysnchronous direct I/O requests
  2012-01-27 21:15 ` [PATCH 2/3] ext4: " Jeff Moyer
@ 2012-02-02 17:31   ` Jan Kara
  2012-02-06 16:20     ` Jeff Moyer
  2012-02-08 15:11     ` Jeff Moyer
  0 siblings, 2 replies; 17+ messages in thread
From: Jan Kara @ 2012-02-02 17:31 UTC (permalink / raw)
  To: Jeff Moyer; +Cc: linux-ext4, xfs, linux-fsdevel

  Hi,

On Fri 27-01-12 16:15:48, Jeff Moyer wrote:
> If a file is opened with O_SYNC|O_DIRECT, the drive cache does not get
> flushed after the write completion.  Instead, it's flushed *before* the
> I/O is sent to the disk (in __generic_file_aio_write).  This patch
> attempts to fix that problem by marking an I/O as requiring a cache
> flush in endio processing.  I'll send a follow-on patch to the
> generic write code to get rid of the bogus generic_write_sync call
> when EIOCBQUEUED is returned.
  Thanks for the patch!

> Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
> ---
>  fs/ext4/ext4.h    |    4 ++++
>  fs/ext4/inode.c   |   11 +++++++++--
>  fs/ext4/page-io.c |   39 ++++++++++++++++++++++++++++++++-------
>  fs/ext4/super.c   |   11 +++++++++++
>  4 files changed, 56 insertions(+), 9 deletions(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 2d55d7c..4377ed3 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -185,6 +185,7 @@ struct mpage_da_data {
>  #define EXT4_IO_END_ERROR	0x0002
>  #define EXT4_IO_END_QUEUED	0x0004
>  #define EXT4_IO_END_DIRECT	0x0008
> +#define EXT4_IO_END_NEEDS_SYNC	0x0010
>  
>  struct ext4_io_page {
>  	struct page	*p_page;
> @@ -1247,6 +1248,9 @@ struct ext4_sb_info {
>  	/* workqueue for dio unwritten */
>  	struct workqueue_struct *dio_unwritten_wq;
>  
> +	/* workqueue for aio+dio+o_sync disk cache flushing */
> +	struct workqueue_struct *aio_dio_flush_wq;
> +
  Hmm, looking at the patch I'm wondering why did you introduce the new
workqueue? It seems dio_unwritten_wq would be enough? You just need to
rename it to something more appropriate ;)

> diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
> index 9e1b8eb..d07cd40 100644
> --- a/fs/ext4/page-io.c
> +++ b/fs/ext4/page-io.c
> @@ -98,15 +98,40 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
>  		   "list->prev 0x%p\n",
>  		   io, inode->i_ino, io->list.next, io->list.prev);
>  
> -	ret = ext4_convert_unwritten_extents(inode, offset, size);
> -	if (ret < 0) {
> -		ext4_msg(inode->i_sb, KERN_EMERG,
> -			 "failed to convert unwritten extents to written "
> -			 "extents -- potential data loss!  "
> -			 "(inode %lu, offset %llu, size %zd, error %d)",
> -			 inode->i_ino, offset, size, ret);
> +	if (io->flag & EXT4_IO_END_UNWRITTEN) {
> +
> +		ret = ext4_convert_unwritten_extents(inode, offset, size);
> +		if (ret < 0) {
> +			ext4_msg(inode->i_sb, KERN_EMERG,
> +				 "failed to convert unwritten extents to "
> +				 "written extents -- potential data loss!  "
> +				 "(inode %lu, offset %llu, size %zd, error %d)",
> +				 inode->i_ino, offset, size, ret);
> +			goto endio;
> +		}
> +	}
> +
> +	/*
> +	 * This function has two callers.  The first is the end_io_work
> +	 * routine just below.  This is an asynchronous completion context.
> +	 * The second is in the fsync path.  For the latter path, we can't
> +	 * return from here until the job is done.  Hence, we issue a
> +	 * blocking blkdev_issue_flush call.
> +	 */
> +	if (io->flag & EXT4_IO_END_NEEDS_SYNC) {
> +		/*
> +		 * Ideally, we'd like to know if the force_commit routine
> +		 * actually did send something to disk.  If it didn't,
> +		 * then we need to issue the cache flush by hand.  For now,
> +		 * play it safe and do both.
> +		 */
> +		ret = ext4_force_commit(inode->i_sb);
> +		if (ret)
> +			goto endio;
> +		ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_NOIO, NULL);
  Look at what ext4_sync_file() does. It's more efficient than this.
You need something like:
	commit_tid = file->f_flags & __O_SYNC ? EXT4_I(inode)->i_sync_tid :
						EXT4_I(inode)->i_datasync_tid;
	if (journal->j_flags & JBD2_BARRIER &&
	    !jbd2_trans_will_send_data_barrier(journal, commit_tid))
		needs_barrier = true;
	jbd2_log_start_commit(journal, commit_tid);
	jbd2_log_wait_commit(journal, commit_tid);
	if (needs_barrier)
		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_NOIO, NULL);

								Honza
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 3/3] filemap: don't call generic_write_sync for -EIOCBQUEUED
  2012-01-27 21:15 ` [PATCH 3/3] filemap: don't call generic_write_sync for -EIOCBQUEUED Jeff Moyer
  2012-01-28 15:08   ` Martin Steigerwald
@ 2012-02-02 17:52   ` Jan Kara
  2012-02-06 16:33     ` Jeff Moyer
  1 sibling, 1 reply; 17+ messages in thread
From: Jan Kara @ 2012-02-02 17:52 UTC (permalink / raw)
  To: Jeff Moyer; +Cc: linux-ext4, xfs, linux-fsdevel

  Hello,

On Fri 27-01-12 16:15:49, Jeff Moyer wrote:
> As it stands, generic_file_aio_write will call into generic_write_sync
> when -EIOCBQUEUED is returned from __generic_file_aio_write.  EIOCBQUEUED
> indicates that an I/O was submitted but NOT completed.  Thus, we will
> flush the disk cache, potentially before the write(s) even make it to
> the disk!
  Yeah. It seems to be a problem introduced by Tejun's rewrite of barrier
code, right? Before that we'd drain the IO queue when cache flush is issued
and thus effectively wait for IO completion...

>  Up until now, this has been the best we could do, as file
> systems didn't bother to flush the disk cache after an O_SYNC AIO+DIO
> write.  After applying the prior two patches to xfs and ext4, at least
> the major two file systems do the right thing.  So, let's go ahead and
> fix this backwards logic.
  But doesn't this break filesystems which you didn't fix explicitely even
more than they were? You are right they might have sent cache flush too
early but they'd at least propely force all metadata modifications (e.g.
from allocation) to disk. But after this patch O_SYNC will have simply no
effect for these filesystems.

Also I was thinking whether we couldn't implement the fix in VFS. Basically
it would be the same like the fix for ext4. Like having a per-sb workqueue
and queue work calling generic_write_sync() from end_io handler when the
file is O_SYNC? That would solve the issue for all filesystems...

								Honza

> Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
> ---
>  mm/filemap.c |    2 +-
>  1 files changed, 1 insertions(+), 1 deletions(-)
> 
> diff --git a/mm/filemap.c b/mm/filemap.c
> index c4ee2e9..004442f 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -2634,7 +2634,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
>  	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
>  	mutex_unlock(&inode->i_mutex);
>  
> -	if (ret > 0 || ret == -EIOCBQUEUED) {
> +	if (ret > 0) {
>  		ssize_t err;
>  
>  		err = generic_write_sync(file, pos, ret);
> -- 
> 1.7.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/3] ext4: honor the O_SYNC flag for aysnchronous direct I/O requests
  2012-02-02 17:31   ` Jan Kara
@ 2012-02-06 16:20     ` Jeff Moyer
  2012-02-06 16:58       ` Jan Kara
  2012-02-08 15:11     ` Jeff Moyer
  1 sibling, 1 reply; 17+ messages in thread
From: Jeff Moyer @ 2012-02-06 16:20 UTC (permalink / raw)
  To: Jan Kara; +Cc: linux-ext4, xfs, linux-fsdevel

Jan Kara <jack@suse.cz> writes:

>> +	/* workqueue for aio+dio+o_sync disk cache flushing */
>> +	struct workqueue_struct *aio_dio_flush_wq;
>> +
>   Hmm, looking at the patch I'm wondering why did you introduce the new
> workqueue? It seems dio_unwritten_wq would be enough? You just need to
> rename it to something more appropriate ;)

I used a new workqueue as the operations are blocking, and I didn't want
to hold up other progress.  If you think re-using the unwritten_wq is
the right thing to do, I'm happy to comply.

>> +	/*
>> +	 * This function has two callers.  The first is the end_io_work
>> +	 * routine just below.  This is an asynchronous completion context.
>> +	 * The second is in the fsync path.  For the latter path, we can't
>> +	 * return from here until the job is done.  Hence, we issue a
>> +	 * blocking blkdev_issue_flush call.
>> +	 */
>> +	if (io->flag & EXT4_IO_END_NEEDS_SYNC) {
>> +		/*
>> +		 * Ideally, we'd like to know if the force_commit routine
>> +		 * actually did send something to disk.  If it didn't,
>> +		 * then we need to issue the cache flush by hand.  For now,
>> +		 * play it safe and do both.
>> +		 */
>> +		ret = ext4_force_commit(inode->i_sb);
>> +		if (ret)
>> +			goto endio;
>> +		ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_NOIO, NULL);
>   Look at what ext4_sync_file() does. It's more efficient than this.
> You need something like:
> 	commit_tid = file->f_flags & __O_SYNC ? EXT4_I(inode)->i_sync_tid :
> 						EXT4_I(inode)->i_datasync_tid;
> 	if (journal->j_flags & JBD2_BARRIER &&
> 	    !jbd2_trans_will_send_data_barrier(journal, commit_tid))
> 		needs_barrier = true;
> 	jbd2_log_start_commit(journal, commit_tid);
> 	jbd2_log_wait_commit(journal, commit_tid);
> 	if (needs_barrier)
> 		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_NOIO, NULL);

Great, thanks for the pointer!

Cheers,
Jeff

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 3/3] filemap: don't call generic_write_sync for -EIOCBQUEUED
  2012-02-02 17:52   ` Jan Kara
@ 2012-02-06 16:33     ` Jeff Moyer
  2012-02-06 19:55       ` Christoph Hellwig
  0 siblings, 1 reply; 17+ messages in thread
From: Jeff Moyer @ 2012-02-06 16:33 UTC (permalink / raw)
  To: Jan Kara; +Cc: linux-ext4, xfs, linux-fsdevel

Jan Kara <jack@suse.cz> writes:

>   Hello,
>
> On Fri 27-01-12 16:15:49, Jeff Moyer wrote:
>> As it stands, generic_file_aio_write will call into generic_write_sync
>> when -EIOCBQUEUED is returned from __generic_file_aio_write.  EIOCBQUEUED
>> indicates that an I/O was submitted but NOT completed.  Thus, we will
>> flush the disk cache, potentially before the write(s) even make it to
>> the disk!
>   Yeah. It seems to be a problem introduced by Tejun's rewrite of barrier
> code, right? Before that we'd drain the IO queue when cache flush is issued
> and thus effectively wait for IO completion...

Right, though hch seems to think even then the problem existed.

>>  Up until now, this has been the best we could do, as file
>> systems didn't bother to flush the disk cache after an O_SYNC AIO+DIO
>> write.  After applying the prior two patches to xfs and ext4, at least
>> the major two file systems do the right thing.  So, let's go ahead and
>> fix this backwards logic.
>   But doesn't this break filesystems which you didn't fix explicitely even
> more than they were? You are right they might have sent cache flush too
> early but they'd at least propely force all metadata modifications (e.g.
> from allocation) to disk. But after this patch O_SYNC will have simply no
> effect for these filesystems.

Yep.  Note that we're calling into generic_write_sync with a negative
value.  I followed that call chain all the way down and convinced myself
that it was "mostly harmless," but it sure as heck ain't right.  I'll
audit other file systems to see whether it's a problem.  btrfs, at
least, isn't affected by this.

> Also I was thinking whether we couldn't implement the fix in VFS. Basically
> it would be the same like the fix for ext4. Like having a per-sb workqueue
> and queue work calling generic_write_sync() from end_io handler when the
> file is O_SYNC? That would solve the issue for all filesystems...

Well, that would require buy-in from the other file system developers.
What do the XFS folks think?

Cheers,
Jeff

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/3] ext4: honor the O_SYNC flag for aysnchronous direct I/O requests
  2012-02-06 16:20     ` Jeff Moyer
@ 2012-02-06 16:58       ` Jan Kara
  0 siblings, 0 replies; 17+ messages in thread
From: Jan Kara @ 2012-02-06 16:58 UTC (permalink / raw)
  To: Jeff Moyer; +Cc: Jan Kara, linux-ext4, xfs, linux-fsdevel

On Mon 06-02-12 11:20:29, Jeff Moyer wrote:
> Jan Kara <jack@suse.cz> writes:
> 
> >> +	/* workqueue for aio+dio+o_sync disk cache flushing */
> >> +	struct workqueue_struct *aio_dio_flush_wq;
> >> +
> >   Hmm, looking at the patch I'm wondering why did you introduce the new
> > workqueue? It seems dio_unwritten_wq would be enough? You just need to
> > rename it to something more appropriate ;)
> 
> I used a new workqueue as the operations are blocking, and I didn't want
> to hold up other progress.  If you think re-using the unwritten_wq is
> the right thing to do, I'm happy to comply.
  Ah, ok. Thinking about it, it's probably better to use a separate work
queue then.

								Honza

-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 3/3] filemap: don't call generic_write_sync for -EIOCBQUEUED
  2012-02-06 16:33     ` Jeff Moyer
@ 2012-02-06 19:55       ` Christoph Hellwig
  2012-02-07 20:39         ` Jeff Moyer
  0 siblings, 1 reply; 17+ messages in thread
From: Christoph Hellwig @ 2012-02-06 19:55 UTC (permalink / raw)
  To: Jeff Moyer; +Cc: Jan Kara, linux-fsdevel, linux-ext4, xfs

On Mon, Feb 06, 2012 at 11:33:29AM -0500, Jeff Moyer wrote:
> > code, right? Before that we'd drain the IO queue when cache flush is issued
> > and thus effectively wait for IO completion...
> 
> Right, though hch seems to think even then the problem existed.

I was wrong, using -o barrier it didn't.  That was however not something
people using O_SYNC heavy production loads would do, they'd use disabled
caches and nobarrier.

> > Also I was thinking whether we couldn't implement the fix in VFS. Basically
> > it would be the same like the fix for ext4. Like having a per-sb workqueue
> > and queue work calling generic_write_sync() from end_io handler when the
> > file is O_SYNC? That would solve the issue for all filesystems...
> 
> Well, that would require buy-in from the other file system developers.
> What do the XFS folks think?

I don't think using that code for XFS makes sene.  But just like
generic_write_sync there's no reason it can't be added to generic code,
just make sure only generic_file_aio_write/__generic_file_aio_write use
it, but generic_file_buffered_write and generic_file_direct_write stay
clear of it.


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 3/3] filemap: don't call generic_write_sync for -EIOCBQUEUED
  2012-02-06 19:55       ` Christoph Hellwig
@ 2012-02-07 20:39         ` Jeff Moyer
  2012-02-08 16:09           ` Jan Kara
  0 siblings, 1 reply; 17+ messages in thread
From: Jeff Moyer @ 2012-02-07 20:39 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Jan Kara, linux-fsdevel, linux-ext4, xfs

Christoph Hellwig <hch@infradead.org> writes:

> On Mon, Feb 06, 2012 at 11:33:29AM -0500, Jeff Moyer wrote:
>> > code, right? Before that we'd drain the IO queue when cache flush is issued
>> > and thus effectively wait for IO completion...
>> 
>> Right, though hch seems to think even then the problem existed.
>
> I was wrong, using -o barrier it didn't.  That was however not something
> people using O_SYNC heavy production loads would do, they'd use disabled
> caches and nobarrier.
>
>> > Also I was thinking whether we couldn't implement the fix in VFS. Basically
>> > it would be the same like the fix for ext4. Like having a per-sb workqueue
>> > and queue work calling generic_write_sync() from end_io handler when the
>> > file is O_SYNC? That would solve the issue for all filesystems...
>> 
>> Well, that would require buy-in from the other file system developers.
>> What do the XFS folks think?
>
> I don't think using that code for XFS makes sene.  But just like
> generic_write_sync there's no reason it can't be added to generic code,
> just make sure only generic_file_aio_write/__generic_file_aio_write use
> it, but generic_file_buffered_write and generic_file_direct_write stay
> clear of it.

ext4_file_write (ext4's .aio_write routine) calls into
generic_file_aio_write.  So, I don't think we can generalize that this
routine means that the file system doesn't install its own endio
handler.  What's more, we'd have to pass an endio routine down the call
stack quite a ways.  In all, I think that would be an uglier solution to
the problem.  Did I miss something?

Cheers,
Jeff

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/3] ext4: honor the O_SYNC flag for aysnchronous direct I/O requests
  2012-02-02 17:31   ` Jan Kara
  2012-02-06 16:20     ` Jeff Moyer
@ 2012-02-08 15:11     ` Jeff Moyer
  2012-02-13 18:27       ` Jan Kara
  1 sibling, 1 reply; 17+ messages in thread
From: Jeff Moyer @ 2012-02-08 15:11 UTC (permalink / raw)
  To: Jan Kara; +Cc: linux-ext4, xfs, linux-fsdevel

Jan Kara <jack@suse.cz> writes:

>   Look at what ext4_sync_file() does. It's more efficient than this.
> You need something like:
> 	commit_tid = file->f_flags & __O_SYNC ? EXT4_I(inode)->i_sync_tid :
> 						EXT4_I(inode)->i_datasync_tid;
> 	if (journal->j_flags & JBD2_BARRIER &&
> 	    !jbd2_trans_will_send_data_barrier(journal, commit_tid))
> 		needs_barrier = true;
> 	jbd2_log_start_commit(journal, commit_tid);
> 	jbd2_log_wait_commit(journal, commit_tid);
> 	if (needs_barrier)
> 		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_NOIO, NULL);

If the transaction won't send a data barrier, wouldn't you want to issue
the flush on the data device prior to commiting the transaction, not
after it?

-Jeff

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 3/3] filemap: don't call generic_write_sync for -EIOCBQUEUED
  2012-02-07 20:39         ` Jeff Moyer
@ 2012-02-08 16:09           ` Jan Kara
  2012-02-08 16:38             ` Jeff Moyer
  0 siblings, 1 reply; 17+ messages in thread
From: Jan Kara @ 2012-02-08 16:09 UTC (permalink / raw)
  To: Jeff Moyer; +Cc: Christoph Hellwig, Jan Kara, linux-fsdevel, linux-ext4, xfs

[-- Attachment #1: Type: text/plain, Size: 2201 bytes --]

On Tue 07-02-12 15:39:06, Jeff Moyer wrote:
> Christoph Hellwig <hch@infradead.org> writes:
> > On Mon, Feb 06, 2012 at 11:33:29AM -0500, Jeff Moyer wrote:
> >> > code, right? Before that we'd drain the IO queue when cache flush is issued
> >> > and thus effectively wait for IO completion...
> >> 
> >> Right, though hch seems to think even then the problem existed.
> >
> > I was wrong, using -o barrier it didn't.  That was however not something
> > people using O_SYNC heavy production loads would do, they'd use disabled
> > caches and nobarrier.
> >
> >> > Also I was thinking whether we couldn't implement the fix in VFS. Basically
> >> > it would be the same like the fix for ext4. Like having a per-sb workqueue
> >> > and queue work calling generic_write_sync() from end_io handler when the
> >> > file is O_SYNC? That would solve the issue for all filesystems...
> >> 
> >> Well, that would require buy-in from the other file system developers.
> >> What do the XFS folks think?
> >
> > I don't think using that code for XFS makes sene.  But just like
> > generic_write_sync there's no reason it can't be added to generic code,
> > just make sure only generic_file_aio_write/__generic_file_aio_write use
> > it, but generic_file_buffered_write and generic_file_direct_write stay
> > clear of it.
> 
> ext4_file_write (ext4's .aio_write routine) calls into
> generic_file_aio_write.  So, I don't think we can generalize that this
> routine means that the file system doesn't install its own endio
> handler.  What's more, we'd have to pass an endio routine down the call
> stack quite a ways.  In all, I think that would be an uglier solution to
> the problem.  Did I miss something?
  I think it can be done in a relatively elegant way. POC patch (completely
untested) is attached. What do you think? All filesystems using
blockdev_direct_IO() can be easily converted to use this, gfs2 & ocfs2 can
also use the framework. That leaves only ext4, xfs & btrfs which need
special handling. Actually, maybe btrfs could be converted as well because
it doesn't seem to need to offload anything else to workqueue. But I'm not
really sure...

								Honza
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

[-- Attachment #2: 0001-vfs-Handle-O_SYNC-aio-dio-in-generic-code-properly.patch --]
[-- Type: text/x-patch, Size: 6911 bytes --]

>From be2a2bdc27f86053a6c7db3f2cfb12b6fc987e52 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 8 Feb 2012 16:56:53 +0100
Subject: [PATCH] vfs: Handle O_SYNC aio dio in generic code properly

Provide VFS helpers for handling O_SYNC aio dio writes. Filesystem wanting
to use the helpers has to initialize s_dio_flush_wq and pass DIO_SYNC_WRITES
to __blockdev_direct_IO. Generic code then takes care to call
generic_write_sync() from a workqueue context when aio dio is completed.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/direct-io.c     |   93 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 fs/super.c         |    2 +
 include/linux/fs.h |   11 ++++++
 3 files changed, 103 insertions(+), 3 deletions(-)

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 4a588db..03c9028 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -112,6 +112,15 @@ struct dio_submit {
 	unsigned tail;			/* last valid page + 1 */
 };
 
+/* state needed for final sync and completion of O_SYNC AIO DIO */
+struct dio_sync_io_work {
+	struct kiocb *iocb;
+	loff_t offset;
+	ssize_t len;
+	int ret;
+	struct work_struct work;
+};
+
 /* dio_state communicated between submission path and end_io */
 struct dio {
 	int flags;			/* doesn't change */
@@ -134,6 +143,7 @@ struct dio {
 	/* AIO related stuff */
 	struct kiocb *iocb;		/* kiocb */
 	ssize_t result;                 /* IO result */
+	struct dio_sync_io_work *sync_work;	/* work used for O_SYNC AIO */
 
 	/*
 	 * pages[] (and any fields placed after it) are not zeroed out at
@@ -261,6 +271,44 @@ static inline struct page *dio_get_page(struct dio *dio,
 }
 
 /**
+ * generic_dio_end_io() - generic dio ->end_io handler
+ * @iocb: iocb of finishing DIO
+ * @offset: the byte offset in the file of the completed operation
+ * @bytes: length of the completed operation
+ * @work: work to queue for O_SYNC AIO DIO, NULL otherwise
+ * @ret: error code if IO failed
+ * @is_async: is this AIO?
+ *
+ * This is generic callback to be called when direct IO is finished. It
+ * handles update of number of outstanding DIOs for an inode, completion
+ * of async iocb and queueing of work if we need to call fsync() because
+ * io was O_SYNC.
+ */
+void generic_dio_end_io(struct kiocb *iocb, loff_t offset, ssize_t bytes,
+			struct dio_sync_io_work *work, int ret, bool is_async)
+{
+	struct inode *inode = iocb->ki_filp->f_dentry->d_inode;
+
+	if (!is_async) {
+		inode_dio_done(inode);
+		return;
+	}
+
+	/*
+	 * If we need to sync file, we offload completion to workqueue
+	 */
+	if (work) {
+		work->ret = ret;
+		work->offset = offset;
+		work->len = bytes;
+		queue_work(inode->i_sb->s_dio_flush_wq, &work->work);
+	} else {
+		aio_complete(iocb, ret, 0);
+		inode_dio_done(inode);
+	}
+}
+
+/**
  * dio_complete() - called when all DIO BIO I/O has been completed
  * @offset: the byte offset in the file of the completed operation
  *
@@ -305,9 +353,13 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is
 		dio->end_io(dio->iocb, offset, transferred,
 			    dio->private, ret, is_async);
 	} else {
-		if (is_async)
-			aio_complete(dio->iocb, ret, 0);
-		inode_dio_done(dio->inode);
+		/* No IO submitted? Skip syncing... */
+		if (!dio->result && dio->sync_work) {
+			kfree(dio->sync_work);
+			dio->sync_work = NULL;
+		}
+		generic_dio_end_io(dio->iocb, offset, transferred,
+				   dio->sync_work, ret, is_async);
 	}
 
 	return ret;
@@ -1064,6 +1116,25 @@ static inline int drop_refcount(struct dio *dio)
 }
 
 /*
+ * Work performed from workqueue when AIO DIO is finished.
+ */
+static void dio_aio_sync_work(struct work_struct *work)
+{
+	struct dio_sync_io_work *sync_work =
+			container_of(work, struct dio_sync_io_work, work);
+	struct kiocb *iocb = sync_work->iocb;
+	struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
+	int err, ret = sync_work->ret;
+
+	err = generic_write_sync(iocb->ki_filp, sync_work->offset,
+				 sync_work->len);
+	if (err < 0 && ret > 0)
+		ret = err;
+	aio_complete(iocb, ret, 0);
+	inode_dio_done(inode);
+}
+
+/*
  * This is a library function for use by filesystem drivers.
  *
  * The locking rules are governed by the flags parameter:
@@ -1155,6 +1226,18 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	memset(dio, 0, offsetof(struct dio, pages));
 
 	dio->flags = flags;
+	if (flags & DIO_SYNC_WRITES && rw & WRITE &&
+	    ((iocb->ki_filp->f_flags & O_DSYNC) || IS_SYNC(inode))) {
+		dio->sync_work = kmalloc(sizeof(struct dio_sync_io_work),
+					 GFP_KERNEL);
+		if (!dio->sync_work) {
+			retval = -ENOMEM;
+			kmem_cache_free(dio_cache, dio);
+			goto out;
+		}
+		INIT_WORK(&dio->sync_work->work, dio_aio_sync_work);
+		dio->sync_work->iocb = iocb;
+	}
 	if (dio->flags & DIO_LOCKING) {
 		if (rw == READ) {
 			struct address_space *mapping =
@@ -1167,6 +1250,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 							      end - 1);
 			if (retval) {
 				mutex_unlock(&inode->i_mutex);
+				kfree(dio->sync_work);
 				kmem_cache_free(dio_cache, dio);
 				goto out;
 			}
@@ -1310,6 +1394,9 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 
 	if (drop_refcount(dio) == 0) {
 		retval = dio_complete(dio, offset, retval, false);
+		/* Test for !NULL to save a call for common case */
+		if (dio->sync_work)
+			kfree(dio->sync_work);
 		kmem_cache_free(dio_cache, dio);
 	} else
 		BUG_ON(retval != -EIOCBQUEUED);
diff --git a/fs/super.c b/fs/super.c
index 6015c02..741784d 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -200,6 +200,8 @@ static inline void destroy_super(struct super_block *s)
 #ifdef CONFIG_SMP
 	free_percpu(s->s_files);
 #endif
+	if (s->s_dio_flush_wq)
+		destroy_workqueue(s->s_dio_flush_wq);
 	security_sb_free(s);
 	WARN_ON(!list_empty(&s->s_mounts));
 	kfree(s->s_subtype);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 386da09..910843e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1496,6 +1496,9 @@ struct super_block {
 
 	/* Being remounted read-only */
 	int s_readonly_remount;
+
+	/* Pending fsync calls for completed AIO DIO with O_SYNC */
+	struct workqueue_struct	*s_dio_flush_wq;
 };
 
 /* superblock cache pruning functions */
@@ -2428,12 +2431,20 @@ enum {
 
 	/* filesystem does not support filling holes */
 	DIO_SKIP_HOLES	= 0x02,
+
+	/* need generic handling of O_SYNC aio writes */
+	DIO_SYNC_WRITES = 0x04
 };
 
 void dio_end_io(struct bio *bio, int error);
 void inode_dio_wait(struct inode *inode);
 void inode_dio_done(struct inode *inode);
 
+static inline struct workqueue_struct *alloc_dio_sync_wq(void)
+{
+	return alloc_workqueue("dio-sync", WQ_UNBOUND, 1);
+}
+
 ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	struct block_device *bdev, const struct iovec *iov, loff_t offset,
 	unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
-- 
1.7.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH 3/3] filemap: don't call generic_write_sync for -EIOCBQUEUED
  2012-02-08 16:09           ` Jan Kara
@ 2012-02-08 16:38             ` Jeff Moyer
  0 siblings, 0 replies; 17+ messages in thread
From: Jeff Moyer @ 2012-02-08 16:38 UTC (permalink / raw)
  To: Jan Kara; +Cc: Christoph Hellwig, linux-fsdevel, linux-ext4, xfs

Jan Kara <jack@suse.cz> writes:

> On Tue 07-02-12 15:39:06, Jeff Moyer wrote:
>> Christoph Hellwig <hch@infradead.org> writes:
>> > On Mon, Feb 06, 2012 at 11:33:29AM -0500, Jeff Moyer wrote:
>> >> > code, right? Before that we'd drain the IO queue when cache flush is issued
>> >> > and thus effectively wait for IO completion...
>> >> 
>> >> Right, though hch seems to think even then the problem existed.
>> >
>> > I was wrong, using -o barrier it didn't.  That was however not something
>> > people using O_SYNC heavy production loads would do, they'd use disabled
>> > caches and nobarrier.
>> >
>> >> > Also I was thinking whether we couldn't implement the fix in VFS. Basically
>> >> > it would be the same like the fix for ext4. Like having a per-sb workqueue
>> >> > and queue work calling generic_write_sync() from end_io handler when the
>> >> > file is O_SYNC? That would solve the issue for all filesystems...
>> >> 
>> >> Well, that would require buy-in from the other file system developers.
>> >> What do the XFS folks think?
>> >
>> > I don't think using that code for XFS makes sene.  But just like
>> > generic_write_sync there's no reason it can't be added to generic code,
>> > just make sure only generic_file_aio_write/__generic_file_aio_write use
>> > it, but generic_file_buffered_write and generic_file_direct_write stay
>> > clear of it.
>> 
>> ext4_file_write (ext4's .aio_write routine) calls into
>> generic_file_aio_write.  So, I don't think we can generalize that this
>> routine means that the file system doesn't install its own endio
>> handler.  What's more, we'd have to pass an endio routine down the call
>> stack quite a ways.  In all, I think that would be an uglier solution to
>> the problem.  Did I miss something?
>   I think it can be done in a relatively elegant way. POC patch (completely
> untested) is attached. What do you think? All filesystems using
> blockdev_direct_IO() can be easily converted to use this, gfs2 & ocfs2 can
> also use the framework. That leaves only ext4, xfs & btrfs which need
> special handling. Actually, maybe btrfs could be converted as well because
> it doesn't seem to need to offload anything else to workqueue. But I'm not
> really sure...

I like it!

-Jeff

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/3] ext4: honor the O_SYNC flag for aysnchronous direct I/O requests
  2012-02-08 15:11     ` Jeff Moyer
@ 2012-02-13 18:27       ` Jan Kara
  0 siblings, 0 replies; 17+ messages in thread
From: Jan Kara @ 2012-02-13 18:27 UTC (permalink / raw)
  To: Jeff Moyer; +Cc: Jan Kara, linux-ext4, xfs, linux-fsdevel

On Wed 08-02-12 10:11:47, Jeff Moyer wrote:
> Jan Kara <jack@suse.cz> writes:
> 
> >   Look at what ext4_sync_file() does. It's more efficient than this.
> > You need something like:
> > 	commit_tid = file->f_flags & __O_SYNC ? EXT4_I(inode)->i_sync_tid :
> > 						EXT4_I(inode)->i_datasync_tid;
> > 	if (journal->j_flags & JBD2_BARRIER &&
> > 	    !jbd2_trans_will_send_data_barrier(journal, commit_tid))
> > 		needs_barrier = true;
> > 	jbd2_log_start_commit(journal, commit_tid);
> > 	jbd2_log_wait_commit(journal, commit_tid);
> > 	if (needs_barrier)
> > 		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_NOIO, NULL);
> 
> If the transaction won't send a data barrier, wouldn't you want to issue
> the flush on the data device prior to commiting the transaction, not
> after it?
  Sorry for late reply. I was thinking about this because the answer isn't
simple... One certain fact is that once ext4_convert_unwritten_extents()
finishes (calls ext4_journal_stop()), the transaction with metadata updates
can commit so whether we place flush before or after jbd2_log_start_commit()
makes no difference. For filesystems where journal is on the filesystem
device, the code should work correctly as is - journalling code will issue
the barrier before transaction commit is done and if there is no
transaction to commit, the place where we issue cache flush does not matter.
But for filesystems where journal is on separate device we indeed need to
issue the flush before the transaction commit is finished so that we
don't expose uninitialized data after crash.

Anyway that's a separate (although related) issue to the one which you fix
in this patch so you can leave the patch as is and I'll fixup the above
problem in a separate patch. Thanks for noticing this!

									Honza
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2012-02-13 18:27 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2012-01-27 21:15 [patch|rfc][0/3] fix aio+dio+O_SYNC writes Jeff Moyer
2012-01-27 21:15 ` [PATCH 1/3] xfs: honor the O_SYNC flag for aysnchronous direct I/O requests Jeff Moyer
2012-01-28 14:59   ` Christoph Hellwig
2012-01-27 21:15 ` [PATCH 2/3] ext4: " Jeff Moyer
2012-02-02 17:31   ` Jan Kara
2012-02-06 16:20     ` Jeff Moyer
2012-02-06 16:58       ` Jan Kara
2012-02-08 15:11     ` Jeff Moyer
2012-02-13 18:27       ` Jan Kara
2012-01-27 21:15 ` [PATCH 3/3] filemap: don't call generic_write_sync for -EIOCBQUEUED Jeff Moyer
2012-01-28 15:08   ` Martin Steigerwald
2012-02-02 17:52   ` Jan Kara
2012-02-06 16:33     ` Jeff Moyer
2012-02-06 19:55       ` Christoph Hellwig
2012-02-07 20:39         ` Jeff Moyer
2012-02-08 16:09           ` Jan Kara
2012-02-08 16:38             ` Jeff Moyer

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).