linux-ext4.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 07/17] vfs: Introduce new helpers for syncing after writing to O_SYNC file or IS_SYNC inode
       [not found] <1250697884-22288-1-git-send-email-jack@suse.cz>
@ 2009-08-19 16:04 ` Jan Kara
  2009-08-19 16:26   ` Christoph Hellwig
  2009-08-19 16:04 ` [PATCH 08/17] ext2: Update comment about generic_osync_inode Jan Kara
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 18+ messages in thread
From: Jan Kara @ 2009-08-19 16:04 UTC (permalink / raw)
  To: LKML
  Cc: hch, Jan Kara, Evgeniy Polyakov, ocfs2-devel, Joel Becker,
	Felix Blyakher, xfs, Anton Altaparmakov, linux-ntfs-dev,
	OGAWA Hirofumi, linux-ext4, tytso

Introduce new function for generic inode syncing (generic_sync_file) and use
it from fsync() path. Introduce also new helper for syncing after a sync
write (generic_write_sync) using the generic function.

Use these new helpers for syncing from generic VFS functions.

CC: Evgeniy Polyakov <zbr@ioremap.net>
CC: ocfs2-devel@oss.oracle.com
CC: Joel Becker <joel.becker@oracle.com>
CC: Felix Blyakher <felixb@sgi.com>
CC: xfs@oss.sgi.com
CC: Anton Altaparmakov <aia21@cantab.net>
CC: linux-ntfs-dev@lists.sourceforge.net
CC: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
CC: linux-ext4@vger.kernel.org
CC: tytso@mit.edu
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/splice.c        |   22 ++++----------
 fs/sync.c          |   81 +++++++++++++++++++++++++++++++++++++++++++---------
 include/linux/fs.h |    7 ++++
 mm/filemap.c       |   16 ++++------
 4 files changed, 86 insertions(+), 40 deletions(-)

diff --git a/fs/splice.c b/fs/splice.c
index 73766d2..8190237 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -976,25 +976,15 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 
 	if (ret > 0) {
 		unsigned long nr_pages;
+		int err;
 
-		*ppos += ret;
 		nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 
-		/*
-		 * If file or inode is SYNC and we actually wrote some data,
-		 * sync it.
-		 */
-		if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
-			int err;
-
-			mutex_lock(&inode->i_mutex);
-			err = generic_osync_inode(inode, mapping,
-						  OSYNC_METADATA|OSYNC_DATA);
-			mutex_unlock(&inode->i_mutex);
-
-			if (err)
-				ret = err;
-		}
+		err = generic_write_sync(out, *ppos, ret);
+		if (err)
+			ret = err;
+		else
+			*ppos += ret;
 		balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
 	}
 
diff --git a/fs/sync.c b/fs/sync.c
index 3422ba6..fc320aa 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -176,23 +176,30 @@ int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
 }
 
 /**
- * vfs_fsync - perform a fsync or fdatasync on a file
+ * generic_sync_file - helper to sync data & metadata to disk
  * @file:		file to sync
  * @dentry:		dentry of @file
- * @data:		only perform a fdatasync operation
+ * @start:		offset in bytes of the beginning of data range to sync
+ * @end:		offset in bytes of the end of data range (inclusive)
+ * @what:		what should be synced
  *
- * Write back data and metadata for @file to disk.  If @datasync is
- * set only metadata needed to access modified file data is written.
+ * What the function exactly does is controlled by the @what parameter:
  *
- * In case this function is called from nfsd @file may be %NULL and
- * only @dentry is set.  This can only happen when the filesystem
- * implements the export_operations API.
+ * If SYNC_SUBMIT_DATA is set, the function submits all pages in the given
+ * range to disk.
+ *
+ * The function always calls ->fsync() callback of the filesystem. If
+ * SYNC_INODE is not set, we pass down the fact that it is just a datasync.
+ *
+ * If SYNC_WAIT_DATA is set, the function waits for writeback to finish
+ * in the given range.
  */
-int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+int generic_sync_file(struct file *file, struct dentry *dentry, loff_t start,
+		      loff_t end, int what)
 {
 	const struct file_operations *fop;
 	struct address_space *mapping;
-	int err, ret;
+	int err, ret = 0;
 
 	/*
 	 * Get mapping and operations from the file in case we have
@@ -212,23 +219,50 @@ int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
 		goto out;
 	}
 
-	ret = filemap_fdatawrite(mapping);
+	if (what & SYNC_SUBMIT_DATA)
+		ret = filemap_fdatawrite_range(mapping, start, end);
 
 	/*
 	 * We need to protect against concurrent writers, which could cause
 	 * livelocks in fsync_buffers_list().
 	 */
 	mutex_lock(&mapping->host->i_mutex);
-	err = fop->fsync(file, dentry, datasync);
+	err = fop->fsync(file, dentry, !(what & SYNC_INODE));
 	if (!ret)
 		ret = err;
 	mutex_unlock(&mapping->host->i_mutex);
-	err = filemap_fdatawait(mapping);
-	if (!ret)
-		ret = err;
+
+	if (what & SYNC_WAIT_DATA) {
+		err = filemap_fdatawait_range(mapping, start, end);
+		if (!ret)
+			ret = err;
+	}
 out:
 	return ret;
 }
+EXPORT_SYMBOL(generic_sync_file);
+
+/**
+ * vfs_fsync - perform a fsync or fdatasync on a file
+ * @file:		file to sync
+ * @dentry:		dentry of @file
+ * @datasync:		only perform a fdatasync operation
+ *
+ * Write back data and metadata for @file to disk.  If @datasync is
+ * set only metadata needed to access modified file data is written.
+ *
+ * In case this function is called from nfsd @file may be %NULL and
+ * only @dentry is set.  This can only happen when the filesystem
+ * implements the export_operations API.
+ */
+int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+{
+	int what = SYNC_SUBMIT_DATA | SYNC_WAIT_DATA;
+
+	if (!datasync)
+		what |= SYNC_INODE;
+	return generic_sync_file(file, dentry, 0, LLONG_MAX, what);
+}
 EXPORT_SYMBOL(vfs_fsync);
 
 static int do_fsync(unsigned int fd, int datasync)
@@ -254,6 +288,25 @@ SYSCALL_DEFINE1(fdatasync, unsigned int, fd)
 	return do_fsync(fd, 1);
 }
 
+/**
+ * generic_write_sync - perform syncing after a write if file / inode is sync
+ * @file:	file to which the write happened
+ * @pos:	offset where the write started
+ * @count:	length of the write
+ *
+ * This is just a simple wrapper about our general syncing function.
+ * FIXME: Make it inline?
+ */
+int generic_write_sync(struct file *file, loff_t pos, loff_t count)
+{
+	if (!(file->f_flags & O_SYNC) && !IS_SYNC(file->f_mapping->host))
+		return 0;
+	return generic_sync_file(file, file->f_path.dentry, pos,
+				 pos + count - 1,
+				 SYNC_SUBMIT_DATA | SYNC_WAIT_DATA);
+}
+EXPORT_SYMBOL(generic_write_sync);
+
 /*
  * sys_sync_file_range() permits finely controlled syncing over a segment of
  * a file in the range offset .. (offset+nbytes-1) inclusive.  If nbytes is
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 1edd159..0f13049 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2088,7 +2088,14 @@ extern int __filemap_fdatawrite_range(struct address_space *mapping,
 extern int filemap_fdatawrite_range(struct address_space *mapping,
 				loff_t start, loff_t end);
 
+/* Flags for generic_sync_file */
+#define SYNC_INODE		1
+#define SYNC_SUBMIT_DATA	2
+#define SYNC_WAIT_DATA		4
+extern int generic_sync_file(struct file *file, struct dentry *dentry,
+			   loff_t start, loff_t end, int what);
 extern int vfs_fsync(struct file *file, struct dentry *dentry, int datasync);
+extern int generic_write_sync(struct file *file, loff_t pos, loff_t count);
 extern void sync_supers(void);
 extern void emergency_sync(void);
 extern void emergency_remount(void);
diff --git a/mm/filemap.c b/mm/filemap.c
index dffdc60..a29b35b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -39,11 +39,10 @@
 /*
  * FIXME: remove all knowledge of the buffer layer from the core VM
  */
-#include <linux/buffer_head.h> /* for generic_osync_inode */
+#include <linux/buffer_head.h> /* for try_to_free_buffers */
 
 #include <asm/mman.h>
 
-
 /*
  * Shared mappings implemented 30.11.1994. It's not fully working yet,
  * though.
@@ -2452,18 +2451,16 @@ ssize_t generic_file_aio_write_nolock(struct kiocb *iocb,
 		const struct iovec *iov, unsigned long nr_segs, loff_t pos)
 {
 	struct file *file = iocb->ki_filp;
-	struct address_space *mapping = file->f_mapping;
-	struct inode *inode = mapping->host;
 	ssize_t ret;
 
 	BUG_ON(iocb->ki_pos != pos);
 
 	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
 
-	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+	if (ret > 0) {
 		ssize_t err;
 
-		err = sync_page_range(inode, mapping, pos, ret);
+		err = generic_write_sync(file, pos, ret);
 		if (err < 0)
 			ret = err;
 	}
@@ -2486,8 +2483,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 		unsigned long nr_segs, loff_t pos)
 {
 	struct file *file = iocb->ki_filp;
-	struct address_space *mapping = file->f_mapping;
-	struct inode *inode = mapping->host;
+	struct inode *inode = file->f_mapping->host;
 	ssize_t ret;
 
 	BUG_ON(iocb->ki_pos != pos);
@@ -2496,10 +2492,10 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
 	mutex_unlock(&inode->i_mutex);
 
-	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+	if (ret > 0) {
 		ssize_t err;
 
-		err = sync_page_range(inode, mapping, pos, ret);
+		err = generic_write_sync(file, pos, ret);
 		if (err < 0)
 			ret = err;
 	}
-- 
1.6.0.2

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 08/17] ext2: Update comment about generic_osync_inode
       [not found] <1250697884-22288-1-git-send-email-jack@suse.cz>
  2009-08-19 16:04 ` [PATCH 07/17] vfs: Introduce new helpers for syncing after writing to O_SYNC file or IS_SYNC inode Jan Kara
@ 2009-08-19 16:04 ` Jan Kara
  2009-08-19 16:04 ` [PATCH 09/17] ext3: Remove syncing logic from ext3_file_write Jan Kara
  2009-08-19 16:04 ` [PATCH 10/17] ext4: Remove syncing logic from ext4_file_write Jan Kara
  3 siblings, 0 replies; 18+ messages in thread
From: Jan Kara @ 2009-08-19 16:04 UTC (permalink / raw)
  To: LKML; +Cc: hch, Jan Kara, linux-ext4

We rely on generic_write_sync() now.

CC: linux-ext4@vger.kernel.org
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext2/inode.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index e271303..1c1638f 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -482,7 +482,7 @@ static int ext2_alloc_branch(struct inode *inode,
 		unlock_buffer(bh);
 		mark_buffer_dirty_inode(bh, inode);
 		/* We used to sync bh here if IS_SYNC(inode).
-		 * But we now rely upon generic_osync_inode()
+		 * But we now rely upon generic_write_sync()
 		 * and b_inode_buffers.  But not for directories.
 		 */
 		if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode))
-- 
1.6.0.2

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 09/17] ext3: Remove syncing logic from ext3_file_write
       [not found] <1250697884-22288-1-git-send-email-jack@suse.cz>
  2009-08-19 16:04 ` [PATCH 07/17] vfs: Introduce new helpers for syncing after writing to O_SYNC file or IS_SYNC inode Jan Kara
  2009-08-19 16:04 ` [PATCH 08/17] ext2: Update comment about generic_osync_inode Jan Kara
@ 2009-08-19 16:04 ` Jan Kara
  2009-08-19 16:04 ` [PATCH 10/17] ext4: Remove syncing logic from ext4_file_write Jan Kara
  3 siblings, 0 replies; 18+ messages in thread
From: Jan Kara @ 2009-08-19 16:04 UTC (permalink / raw)
  To: LKML; +Cc: hch, Jan Kara, linux-ext4

Syncing is now properly done by generic_file_aio_write() so no special logic is
needed in ext3.

CC: linux-ext4@vger.kernel.org
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext3/file.c |   61 +-------------------------------------------------------
 1 files changed, 1 insertions(+), 60 deletions(-)

diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index 5b49704..51fee5f 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -51,71 +51,12 @@ static int ext3_release_file (struct inode * inode, struct file * filp)
 	return 0;
 }
 
-static ssize_t
-ext3_file_write(struct kiocb *iocb, const struct iovec *iov,
-		unsigned long nr_segs, loff_t pos)
-{
-	struct file *file = iocb->ki_filp;
-	struct inode *inode = file->f_path.dentry->d_inode;
-	ssize_t ret;
-	int err;
-
-	ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
-
-	/*
-	 * Skip flushing if there was an error, or if nothing was written.
-	 */
-	if (ret <= 0)
-		return ret;
-
-	/*
-	 * If the inode is IS_SYNC, or is O_SYNC and we are doing data
-	 * journalling then we need to make sure that we force the transaction
-	 * to disk to keep all metadata uptodate synchronously.
-	 */
-	if (file->f_flags & O_SYNC) {
-		/*
-		 * If we are non-data-journaled, then the dirty data has
-		 * already been flushed to backing store by generic_osync_inode,
-		 * and the inode has been flushed too if there have been any
-		 * modifications other than mere timestamp updates.
-		 *
-		 * Open question --- do we care about flushing timestamps too
-		 * if the inode is IS_SYNC?
-		 */
-		if (!ext3_should_journal_data(inode))
-			return ret;
-
-		goto force_commit;
-	}
-
-	/*
-	 * So we know that there has been no forced data flush.  If the inode
-	 * is marked IS_SYNC, we need to force one ourselves.
-	 */
-	if (!IS_SYNC(inode))
-		return ret;
-
-	/*
-	 * Open question #2 --- should we force data to disk here too?  If we
-	 * don't, the only impact is that data=writeback filesystems won't
-	 * flush data to disk automatically on IS_SYNC, only metadata (but
-	 * historically, that is what ext2 has done.)
-	 */
-
-force_commit:
-	err = ext3_force_commit(inode->i_sb);
-	if (err)
-		return err;
-	return ret;
-}
-
 const struct file_operations ext3_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
 	.write		= do_sync_write,
 	.aio_read	= generic_file_aio_read,
-	.aio_write	= ext3_file_write,
+	.aio_write	= generic_file_aio_write,
 	.unlocked_ioctl	= ext3_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ext3_compat_ioctl,
-- 
1.6.0.2

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 10/17] ext4: Remove syncing logic from ext4_file_write
       [not found] <1250697884-22288-1-git-send-email-jack@suse.cz>
                   ` (2 preceding siblings ...)
  2009-08-19 16:04 ` [PATCH 09/17] ext3: Remove syncing logic from ext3_file_write Jan Kara
@ 2009-08-19 16:04 ` Jan Kara
  3 siblings, 0 replies; 18+ messages in thread
From: Jan Kara @ 2009-08-19 16:04 UTC (permalink / raw)
  To: LKML; +Cc: hch, Jan Kara, linux-ext4, tytso

The syncing is now properly handled by generic_file_aio_write() so
no special ext4 code is needed.

CC: linux-ext4@vger.kernel.org
CC: tytso@mit.edu
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext4/file.c |   53 ++---------------------------------------------------
 1 files changed, 2 insertions(+), 51 deletions(-)

diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 3f1873f..aafe432 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -58,10 +58,7 @@ static ssize_t
 ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
 		unsigned long nr_segs, loff_t pos)
 {
-	struct file *file = iocb->ki_filp;
-	struct inode *inode = file->f_path.dentry->d_inode;
-	ssize_t ret;
-	int err;
+	struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
 
 	/*
 	 * If we have encountered a bitmap-format file, the size limit
@@ -81,53 +78,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
 		}
 	}
 
-	ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
-	/*
-	 * Skip flushing if there was an error, or if nothing was written.
-	 */
-	if (ret <= 0)
-		return ret;
-
-	/*
-	 * If the inode is IS_SYNC, or is O_SYNC and we are doing data
-	 * journalling then we need to make sure that we force the transaction
-	 * to disk to keep all metadata uptodate synchronously.
-	 */
-	if (file->f_flags & O_SYNC) {
-		/*
-		 * If we are non-data-journaled, then the dirty data has
-		 * already been flushed to backing store by generic_osync_inode,
-		 * and the inode has been flushed too if there have been any
-		 * modifications other than mere timestamp updates.
-		 *
-		 * Open question --- do we care about flushing timestamps too
-		 * if the inode is IS_SYNC?
-		 */
-		if (!ext4_should_journal_data(inode))
-			return ret;
-
-		goto force_commit;
-	}
-
-	/*
-	 * So we know that there has been no forced data flush.  If the inode
-	 * is marked IS_SYNC, we need to force one ourselves.
-	 */
-	if (!IS_SYNC(inode))
-		return ret;
-
-	/*
-	 * Open question #2 --- should we force data to disk here too?  If we
-	 * don't, the only impact is that data=writeback filesystems won't
-	 * flush data to disk automatically on IS_SYNC, only metadata (but
-	 * historically, that is what ext2 has done.)
-	 */

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [PATCH 07/17] vfs: Introduce new helpers for syncing after writing to O_SYNC file or IS_SYNC inode
  2009-08-19 16:04 ` [PATCH 07/17] vfs: Introduce new helpers for syncing after writing to O_SYNC file or IS_SYNC inode Jan Kara
@ 2009-08-19 16:26   ` Christoph Hellwig
  2009-08-20 12:15     ` Jan Kara
  0 siblings, 1 reply; 18+ messages in thread
From: Christoph Hellwig @ 2009-08-19 16:26 UTC (permalink / raw)
  To: Jan Kara
  Cc: LKML, hch, Evgeniy Polyakov, ocfs2-devel, Joel Becker,
	Felix Blyakher, xfs, Anton Altaparmakov, linux-ntfs-dev,
	OGAWA Hirofumi, linux-ext4, tytso

Looks good to me.  Eventually we should use those SYNC_ flags also all
through the fsync codepath, but I'll see if I can incorporate that in my
planned fsync rewrite.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 07/17] vfs: Introduce new helpers for syncing after writing to O_SYNC file or IS_SYNC inode
  2009-08-19 16:26   ` Christoph Hellwig
@ 2009-08-20 12:15     ` Jan Kara
  2009-08-20 16:27       ` Christoph Hellwig
  0 siblings, 1 reply; 18+ messages in thread
From: Jan Kara @ 2009-08-20 12:15 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jan Kara, LKML, Evgeniy Polyakov, ocfs2-devel, Joel Becker,
	Felix Blyakher, xfs, Anton Altaparmakov, linux-ntfs-dev,
	OGAWA Hirofumi, linux-ext4, tytso

On Wed 19-08-09 12:26:38, Christoph Hellwig wrote:
> Looks good to me.  Eventually we should use those SYNC_ flags also all
> through the fsync codepath, but I'll see if I can incorporate that in my
> planned fsync rewrite.
  Yes, I thought I'll leave that for later. BTW it should be fairly easy to
teach generic_sync_file() to do fdatawait() before calling ->fsync() if the
filesystem sets some flag in inode->i_mapping (or somewhere else) as is
needed for XFS, btrfs, etc.

								Honza
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 07/17] vfs: Introduce new helpers for syncing after writing to O_SYNC file or IS_SYNC inode
  2009-08-20 12:15     ` Jan Kara
@ 2009-08-20 16:27       ` Christoph Hellwig
  2009-08-21 15:23         ` Jan Kara
  2009-08-26 18:22         ` Christoph Hellwig
  0 siblings, 2 replies; 18+ messages in thread
From: Christoph Hellwig @ 2009-08-20 16:27 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christoph Hellwig, LKML, Evgeniy Polyakov, ocfs2-devel,
	Joel Becker, Felix Blyakher, xfs, Anton Altaparmakov,
	linux-ntfs-dev, OGAWA Hirofumi, linux-ext4, tytso

On Thu, Aug 20, 2009 at 02:15:31PM +0200, Jan Kara wrote:
> On Wed 19-08-09 12:26:38, Christoph Hellwig wrote:
> > Looks good to me.  Eventually we should use those SYNC_ flags also all
> > through the fsync codepath, but I'll see if I can incorporate that in my
> > planned fsync rewrite.
>   Yes, I thought I'll leave that for later. BTW it should be fairly easy to
> teach generic_sync_file() to do fdatawait() before calling ->fsync() if the
> filesystem sets some flag in inode->i_mapping (or somewhere else) as is
> needed for XFS, btrfs, etc.

Maybe you can help brain storming, but I still can't see any way in that
the

  - write data
  - write inode
  - wait for data

actually is a benefit in terms of semantics (I agree that it could be
faster in theory, but even that is debatable with todays seek latencies
in disks)

Think about a simple non-journaling filesystem like ext2:

 (1) block get allocated during ->write before putting data in
      - this dirties the inode because we update i_block/i_size/etc
 (2) we call fsync (or the O_SNC handling code for that matter)
      - we start writeout of the data, which takes forever because the
	file is very large
      - then we write out the inode, including the i_size/i_blocks
	update
      - due to some reason this gets reordered before the data writeout
	finishes (without that happening there would be no benefit to
	this ordering anyway)
 (3) no we call filemap_fdatawait to wait for data I/O to finish


Now the system crashes between (2) and (3).  After that we we do have
stale data in the inode in the area not written yet.

Is there some case between that simple filesystem and the i_size update
from I/O completion handler in XFS/ext4 where this behaviour actually
buys us anything?  Any ext3 magic maybe?

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 07/17] vfs: Introduce new helpers for syncing after writing to O_SYNC file or IS_SYNC inode
  2009-08-20 16:27       ` Christoph Hellwig
@ 2009-08-21 15:23         ` Jan Kara
  2009-08-21 15:32           ` Christoph Hellwig
  2009-08-26 18:22         ` Christoph Hellwig
  1 sibling, 1 reply; 18+ messages in thread
From: Jan Kara @ 2009-08-21 15:23 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jan Kara, LKML, Evgeniy Polyakov, ocfs2-devel, Joel Becker,
	Felix Blyakher, xfs, Anton Altaparmakov, linux-ntfs-dev,
	OGAWA Hirofumi, linux-ext4, tytso

On Thu 20-08-09 12:27:29, Christoph Hellwig wrote:
> On Thu, Aug 20, 2009 at 02:15:31PM +0200, Jan Kara wrote:
> > On Wed 19-08-09 12:26:38, Christoph Hellwig wrote:
> > > Looks good to me.  Eventually we should use those SYNC_ flags also all
> > > through the fsync codepath, but I'll see if I can incorporate that in my
> > > planned fsync rewrite.
> >   Yes, I thought I'll leave that for later. BTW it should be fairly easy to
> > teach generic_sync_file() to do fdatawait() before calling ->fsync() if the
> > filesystem sets some flag in inode->i_mapping (or somewhere else) as is
> > needed for XFS, btrfs, etc.
> 
> Maybe you can help brain storming, but I still can't see any way in that
> the
> 
>   - write data
>   - write inode
>   - wait for data
> 
> actually is a benefit in terms of semantics (I agree that it could be
> faster in theory, but even that is debatable with todays seek latencies
> in disks)
> 
> Think about a simple non-journaling filesystem like ext2:
> 
>  (1) block get allocated during ->write before putting data in
>       - this dirties the inode because we update i_block/i_size/etc
>  (2) we call fsync (or the O_SNC handling code for that matter)
>       - we start writeout of the data, which takes forever because the
> 	file is very large
>       - then we write out the inode, including the i_size/i_blocks
> 	update
>       - due to some reason this gets reordered before the data writeout
> 	finishes (without that happening there would be no benefit to
> 	this ordering anyway)
>  (3) no we call filemap_fdatawait to wait for data I/O to finish
> 
> Now the system crashes between (2) and (3).  After that we we do have
> stale data in the inode in the area not written yet.
  Yes, that's true.

> Is there some case between that simple filesystem and the i_size update
> from I/O completion handler in XFS/ext4 where this behaviour actually
> buys us anything?  Any ext3 magic maybe?
  Hmm, I can imagine it would buy us something in two cases (but looking at
the code, neither is implemented in such a way that it would really help
us in any way):
1) when an inode and it's data are stored in one block (e.g. OCFS2 or UDF) do
   this.
2) when we journal data
  In the first case we would wait for block with data to be written only to
submit it again because inode was still dirty.
  In the second case, it would make sence if we waited for transaction
commit in fdatawait() because only then data is really on disk. But I don't
know about a fs which would do it - ext3 in data=journal mode just adds
page buffers to the current transaction in writepage and never sets
PageWriteback so fdatawait() is nop for it. The page is pinned in memory
only by the fact that its buffer heads are part of a transaction and thus
cannot be freed.
  So currently I don't know about real cases where fdatawait after ->fsync()
would buy us anything...

									Honza
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 07/17] vfs: Introduce new helpers for syncing after writing to O_SYNC file or IS_SYNC inode
  2009-08-21 15:23         ` Jan Kara
@ 2009-08-21 15:32           ` Christoph Hellwig
  2009-08-21 15:48             ` Jan Kara
  0 siblings, 1 reply; 18+ messages in thread
From: Christoph Hellwig @ 2009-08-21 15:32 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christoph Hellwig, LKML, Evgeniy Polyakov, ocfs2-devel,
	Joel Becker, Felix Blyakher, xfs, Anton Altaparmakov,
	linux-ntfs-dev, OGAWA Hirofumi, linux-ext4, tytso

On Fri, Aug 21, 2009 at 05:23:39PM +0200, Jan Kara wrote:
>   Hmm, I can imagine it would buy us something in two cases (but looking at
> the code, neither is implemented in such a way that it would really help
> us in any way):
> 1) when an inode and it's data are stored in one block (e.g. OCFS2 or UDF) do
>    this.

>   In the first case we would wait for block with data to be written only to
> submit it again because inode was still dirty.

But we would not actually store in the pagecache or at least never
writeback it via the pacache in that case, but always just write it as
part of the inode.


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 07/17] vfs: Introduce new helpers for syncing after writing to O_SYNC file or IS_SYNC inode
  2009-08-21 15:32           ` Christoph Hellwig
@ 2009-08-21 15:48             ` Jan Kara
  0 siblings, 0 replies; 18+ messages in thread
From: Jan Kara @ 2009-08-21 15:48 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jan Kara, LKML, Evgeniy Polyakov, ocfs2-devel, Joel Becker,
	Felix Blyakher, xfs, Anton Altaparmakov, linux-ntfs-dev,
	OGAWA Hirofumi, linux-ext4, tytso

On Fri 21-08-09 11:32:35, Christoph Hellwig wrote:
> On Fri, Aug 21, 2009 at 05:23:39PM +0200, Jan Kara wrote:
> >   Hmm, I can imagine it would buy us something in two cases (but looking at
> > the code, neither is implemented in such a way that it would really help
> > us in any way):
> > 1) when an inode and it's data are stored in one block (e.g. OCFS2 or UDF) do
> >    this.
> 
> >   In the first case we would wait for block with data to be written only to
> > submit it again because inode was still dirty.
> 
> But we would not actually store in the pagecache or at least never
> writeback it via the pacache in that case, but always just write it as
> part of the inode.
  Yes, probably. Actually, that's what UDF does. It's just that then the
sequence fdatawrite(), fdatawait() does not really send the data to disk.
But that probably does not matter.

									Honza
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH 07/17] vfs: Introduce new helpers for syncing after writing to O_SYNC file or IS_SYNC inode
       [not found] <1250874001-15483-1-git-send-email-jack@suse.cz>
@ 2009-08-21 16:59 ` Jan Kara
  0 siblings, 0 replies; 18+ messages in thread
From: Jan Kara @ 2009-08-21 16:59 UTC (permalink / raw)
  To: LKML
  Cc: hch, linux-fsdevel, Jan Kara, Evgeniy Polyakov, ocfs2-devel,
	Joel Becker, Felix Blyakher, xfs, Anton Altaparmakov,
	linux-ntfs-dev, OGAWA Hirofumi, linux-ext4, tytso

Introduce new function for generic inode syncing (generic_sync_file) and use
it from fsync() path. Introduce also new helper for syncing after a sync
write (generic_write_sync) using the generic function.

Use these new helpers for syncing from generic VFS functions. This makes
O_SYNC writes to block devices acquire i_mutex for syncing. If we really
care about this, we can make block_fsync() drop the i_mutex and reacquire
it before it returns.

CC: Evgeniy Polyakov <zbr@ioremap.net>
CC: ocfs2-devel@oss.oracle.com
CC: Joel Becker <joel.becker@oracle.com>
CC: Felix Blyakher <felixb@sgi.com>
CC: xfs@oss.sgi.com
CC: Anton Altaparmakov <aia21@cantab.net>
CC: linux-ntfs-dev@lists.sourceforge.net
CC: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
CC: linux-ext4@vger.kernel.org
CC: tytso@mit.edu
Acked-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/splice.c        |   22 ++++----------
 fs/sync.c          |   81 +++++++++++++++++++++++++++++++++++++++++++---------
 include/linux/fs.h |    7 ++++
 mm/filemap.c       |   18 ++++--------
 4 files changed, 86 insertions(+), 42 deletions(-)

diff --git a/fs/splice.c b/fs/splice.c
index 73766d2..8190237 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -976,25 +976,15 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 
 	if (ret > 0) {
 		unsigned long nr_pages;
+		int err;
 
-		*ppos += ret;
 		nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 
-		/*
-		 * If file or inode is SYNC and we actually wrote some data,
-		 * sync it.
-		 */
-		if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
-			int err;
-
-			mutex_lock(&inode->i_mutex);
-			err = generic_osync_inode(inode, mapping,
-						  OSYNC_METADATA|OSYNC_DATA);
-			mutex_unlock(&inode->i_mutex);
-
-			if (err)
-				ret = err;
-		}
+		err = generic_write_sync(out, *ppos, ret);
+		if (err)
+			ret = err;
+		else
+			*ppos += ret;
 		balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
 	}
 
diff --git a/fs/sync.c b/fs/sync.c
index 3422ba6..fc320aa 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -176,23 +176,30 @@ int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
 }
 
 /**
- * vfs_fsync - perform a fsync or fdatasync on a file
+ * generic_sync_file - helper to sync data & metadata to disk
  * @file:		file to sync
  * @dentry:		dentry of @file
- * @data:		only perform a fdatasync operation
+ * @start:		offset in bytes of the beginning of data range to sync
+ * @end:		offset in bytes of the end of data range (inclusive)
+ * @what:		what should be synced
  *
- * Write back data and metadata for @file to disk.  If @datasync is
- * set only metadata needed to access modified file data is written.
+ * What the function exactly does is controlled by the @what parameter:
  *
- * In case this function is called from nfsd @file may be %NULL and
- * only @dentry is set.  This can only happen when the filesystem
- * implements the export_operations API.
+ * If SYNC_SUBMIT_DATA is set, the function submits all pages in the given
+ * range to disk.
+ *
+ * The function always calls ->fsync() callback of the filesystem. If
+ * SYNC_INODE is not set, we pass down the fact that it is just a datasync.
+ *
+ * If SYNC_WAIT_DATA is set, the function waits for writeback to finish
+ * in the given range.
  */
-int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+int generic_sync_file(struct file *file, struct dentry *dentry, loff_t start,
+		      loff_t end, int what)
 {
 	const struct file_operations *fop;
 	struct address_space *mapping;
-	int err, ret;
+	int err, ret = 0;
 
 	/*
 	 * Get mapping and operations from the file in case we have
@@ -212,23 +219,50 @@ int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
 		goto out;
 	}
 
-	ret = filemap_fdatawrite(mapping);
+	if (what & SYNC_SUBMIT_DATA)
+		ret = filemap_fdatawrite_range(mapping, start, end);
 
 	/*
 	 * We need to protect against concurrent writers, which could cause
 	 * livelocks in fsync_buffers_list().
 	 */
 	mutex_lock(&mapping->host->i_mutex);
-	err = fop->fsync(file, dentry, datasync);
+	err = fop->fsync(file, dentry, !(what & SYNC_INODE));
 	if (!ret)
 		ret = err;
 	mutex_unlock(&mapping->host->i_mutex);
-	err = filemap_fdatawait(mapping);
-	if (!ret)
-		ret = err;
+
+	if (what & SYNC_WAIT_DATA) {
+		err = filemap_fdatawait_range(mapping, start, end);
+		if (!ret)
+			ret = err;
+	}
 out:
 	return ret;
 }
+EXPORT_SYMBOL(generic_sync_file);
+
+/**
+ * vfs_fsync - perform a fsync or fdatasync on a file
+ * @file:		file to sync
+ * @dentry:		dentry of @file
+ * @datasync:		only perform a fdatasync operation
+ *
+ * Write back data and metadata for @file to disk.  If @datasync is
+ * set only metadata needed to access modified file data is written.
+ *
+ * In case this function is called from nfsd @file may be %NULL and
+ * only @dentry is set.  This can only happen when the filesystem
+ * implements the export_operations API.
+ */
+int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+{
+	int what = SYNC_SUBMIT_DATA | SYNC_WAIT_DATA;
+
+	if (!datasync)
+		what |= SYNC_INODE;
+	return generic_sync_file(file, dentry, 0, LLONG_MAX, what);
+}
 EXPORT_SYMBOL(vfs_fsync);
 
 static int do_fsync(unsigned int fd, int datasync)
@@ -254,6 +288,25 @@ SYSCALL_DEFINE1(fdatasync, unsigned int, fd)
 	return do_fsync(fd, 1);
 }
 
+/**
+ * generic_write_sync - perform syncing after a write if file / inode is sync
+ * @file:	file to which the write happened
+ * @pos:	offset where the write started
+ * @count:	length of the write
+ *
+ * This is just a simple wrapper about our general syncing function.
+ * FIXME: Make it inline?
+ */
+int generic_write_sync(struct file *file, loff_t pos, loff_t count)
+{
+	if (!(file->f_flags & O_SYNC) && !IS_SYNC(file->f_mapping->host))
+		return 0;
+	return generic_sync_file(file, file->f_path.dentry, pos,
+				 pos + count - 1,
+				 SYNC_SUBMIT_DATA | SYNC_WAIT_DATA);
+}
+EXPORT_SYMBOL(generic_write_sync);
+
 /*
  * sys_sync_file_range() permits finely controlled syncing over a segment of
  * a file in the range offset .. (offset+nbytes-1) inclusive.  If nbytes is
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 29fc8da..648001c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2088,7 +2088,14 @@ extern int __filemap_fdatawrite_range(struct address_space *mapping,
 extern int filemap_fdatawrite_range(struct address_space *mapping,
 				loff_t start, loff_t end);
 
+/* Flags for generic_sync_file */
+#define SYNC_INODE		1
+#define SYNC_SUBMIT_DATA	2
+#define SYNC_WAIT_DATA		4
+extern int generic_sync_file(struct file *file, struct dentry *dentry,
+			   loff_t start, loff_t end, int what);
 extern int vfs_fsync(struct file *file, struct dentry *dentry, int datasync);
+extern int generic_write_sync(struct file *file, loff_t pos, loff_t count);
 extern void sync_supers(void);
 extern void emergency_sync(void);
 extern void emergency_remount(void);
diff --git a/mm/filemap.c b/mm/filemap.c
index ef9f635..d0802a9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -39,11 +39,10 @@
 /*
  * FIXME: remove all knowledge of the buffer layer from the core VM
  */
-#include <linux/buffer_head.h> /* for generic_osync_inode */
+#include <linux/buffer_head.h> /* for try_to_free_buffers */
 
 #include <asm/mman.h>
 
-
 /*
  * Shared mappings implemented 30.11.1994. It's not fully working yet,
  * though.
@@ -2480,19 +2479,16 @@ ssize_t device_aio_write(struct kiocb *iocb, const struct iovec *iov,
 			 unsigned long nr_segs, loff_t pos)
 {
 	struct file *file = iocb->ki_filp;
-	struct address_space *mapping = file->f_mapping;
-	struct inode *inode = mapping->host;
 	ssize_t ret;
 
 	BUG_ON(iocb->ki_pos != pos);
 
 	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
 
-	if ((ret > 0 || ret == -EIOCBQUEUED) &&
-	    ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+	if (ret > 0 || ret == -EIOCBQUEUED) {
 		ssize_t err;
 
-		err = sync_page_range_nolock(inode, mapping, pos, ret);
+		err = generic_write_sync(file, pos, ret);
 		if (err < 0)
 			ret = err;
 	}
@@ -2515,8 +2511,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 		unsigned long nr_segs, loff_t pos)
 {
 	struct file *file = iocb->ki_filp;
-	struct address_space *mapping = file->f_mapping;
-	struct inode *inode = mapping->host;
+	struct inode *inode = file->f_mapping->host;
 	ssize_t ret;
 
 	BUG_ON(iocb->ki_pos != pos);
@@ -2525,11 +2520,10 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
 	mutex_unlock(&inode->i_mutex);
 
-	if ((ret > 0 || ret == -EIOCBQUEUED) &&
-	    ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+	if (ret > 0 || ret == -EIOCBQUEUED) {
 		ssize_t err;
 
-		err = sync_page_range(inode, mapping, pos, ret);
+		err = generic_write_sync(file, pos, ret);
 		if (err < 0)
 			ret = err;
 	}
-- 
1.6.0.2

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 07/17] vfs: Introduce new helpers for syncing after writing to O_SYNC file or IS_SYNC inode
       [not found] <1250875447-15622-1-git-send-email-jack@suse.cz>
@ 2009-08-21 17:23 ` Jan Kara
  2009-08-27 17:35   ` Christoph Hellwig
  0 siblings, 1 reply; 18+ messages in thread
From: Jan Kara @ 2009-08-21 17:23 UTC (permalink / raw)
  To: LKML
  Cc: hch, linux-fsdevel, Jan Kara, Evgeniy Polyakov, ocfs2-devel,
	Joel Becker, Felix Blyakher, xfs, Anton Altaparmakov,
	linux-ntfs-dev, OGAWA Hirofumi, linux-ext4, tytso

Introduce new function for generic inode syncing (generic_sync_file) and use
it from fsync() path. Introduce also new helper for syncing after a sync
write (generic_write_sync) using the generic function.

Use these new helpers for syncing from generic VFS functions. This makes
O_SYNC writes to block devices acquire i_mutex for syncing. If we really
care about this, we can make block_fsync() drop the i_mutex and reacquire
it before it returns.

CC: Evgeniy Polyakov <zbr@ioremap.net>
CC: ocfs2-devel@oss.oracle.com
CC: Joel Becker <joel.becker@oracle.com>
CC: Felix Blyakher <felixb@sgi.com>
CC: xfs@oss.sgi.com
CC: Anton Altaparmakov <aia21@cantab.net>
CC: linux-ntfs-dev@lists.sourceforge.net
CC: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
CC: linux-ext4@vger.kernel.org
CC: tytso@mit.edu
Acked-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/splice.c        |   22 ++++----------
 fs/sync.c          |   81 +++++++++++++++++++++++++++++++++++++++++++---------
 include/linux/fs.h |    7 ++++
 mm/filemap.c       |   18 ++++--------
 4 files changed, 86 insertions(+), 42 deletions(-)

diff --git a/fs/splice.c b/fs/splice.c
index 73766d2..8190237 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -976,25 +976,15 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 
 	if (ret > 0) {
 		unsigned long nr_pages;
+		int err;
 
-		*ppos += ret;
 		nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 
-		/*
-		 * If file or inode is SYNC and we actually wrote some data,
-		 * sync it.
-		 */
-		if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
-			int err;
-
-			mutex_lock(&inode->i_mutex);
-			err = generic_osync_inode(inode, mapping,
-						  OSYNC_METADATA|OSYNC_DATA);
-			mutex_unlock(&inode->i_mutex);
-
-			if (err)
-				ret = err;
-		}
+		err = generic_write_sync(out, *ppos, ret);
+		if (err)
+			ret = err;
+		else
+			*ppos += ret;
 		balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
 	}
 
diff --git a/fs/sync.c b/fs/sync.c
index 3422ba6..fc320aa 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -176,23 +176,30 @@ int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
 }
 
 /**
- * vfs_fsync - perform a fsync or fdatasync on a file
+ * generic_sync_file - helper to sync data & metadata to disk
  * @file:		file to sync
  * @dentry:		dentry of @file
- * @data:		only perform a fdatasync operation
+ * @start:		offset in bytes of the beginning of data range to sync
+ * @end:		offset in bytes of the end of data range (inclusive)
+ * @what:		what should be synced
  *
- * Write back data and metadata for @file to disk.  If @datasync is
- * set only metadata needed to access modified file data is written.
+ * What the function exactly does is controlled by the @what parameter:
  *
- * In case this function is called from nfsd @file may be %NULL and
- * only @dentry is set.  This can only happen when the filesystem
- * implements the export_operations API.
+ * If SYNC_SUBMIT_DATA is set, the function submits all pages in the given
+ * range to disk.
+ *
+ * The function always calls ->fsync() callback of the filesystem. If
+ * SYNC_INODE is not set, we pass down the fact that it is just a datasync.
+ *
+ * If SYNC_WAIT_DATA is set, the function waits for writeback to finish
+ * in the given range.
  */
-int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+int generic_sync_file(struct file *file, struct dentry *dentry, loff_t start,
+		      loff_t end, int what)
 {
 	const struct file_operations *fop;
 	struct address_space *mapping;
-	int err, ret;
+	int err, ret = 0;
 
 	/*
 	 * Get mapping and operations from the file in case we have
@@ -212,23 +219,50 @@ int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
 		goto out;
 	}
 
-	ret = filemap_fdatawrite(mapping);
+	if (what & SYNC_SUBMIT_DATA)
+		ret = filemap_fdatawrite_range(mapping, start, end);
 
 	/*
 	 * We need to protect against concurrent writers, which could cause
 	 * livelocks in fsync_buffers_list().
 	 */
 	mutex_lock(&mapping->host->i_mutex);
-	err = fop->fsync(file, dentry, datasync);
+	err = fop->fsync(file, dentry, !(what & SYNC_INODE));
 	if (!ret)
 		ret = err;
 	mutex_unlock(&mapping->host->i_mutex);
-	err = filemap_fdatawait(mapping);
-	if (!ret)
-		ret = err;
+
+	if (what & SYNC_WAIT_DATA) {
+		err = filemap_fdatawait_range(mapping, start, end);
+		if (!ret)
+			ret = err;
+	}
 out:
 	return ret;
 }
+EXPORT_SYMBOL(generic_sync_file);
+
+/**
+ * vfs_fsync - perform a fsync or fdatasync on a file
+ * @file:		file to sync
+ * @dentry:		dentry of @file
+ * @datasync:		only perform a fdatasync operation
+ *
+ * Write back data and metadata for @file to disk.  If @datasync is
+ * set only metadata needed to access modified file data is written.
+ *
+ * In case this function is called from nfsd @file may be %NULL and
+ * only @dentry is set.  This can only happen when the filesystem
+ * implements the export_operations API.
+ */
+int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+{
+	int what = SYNC_SUBMIT_DATA | SYNC_WAIT_DATA;
+
+	if (!datasync)
+		what |= SYNC_INODE;
+	return generic_sync_file(file, dentry, 0, LLONG_MAX, what);
+}
 EXPORT_SYMBOL(vfs_fsync);
 
 static int do_fsync(unsigned int fd, int datasync)
@@ -254,6 +288,25 @@ SYSCALL_DEFINE1(fdatasync, unsigned int, fd)
 	return do_fsync(fd, 1);
 }
 
+/**
+ * generic_write_sync - perform syncing after a write if file / inode is sync
+ * @file:	file to which the write happened
+ * @pos:	offset where the write started
+ * @count:	length of the write
+ *
+ * This is just a simple wrapper about our general syncing function.
+ * FIXME: Make it inline?
+ */
+int generic_write_sync(struct file *file, loff_t pos, loff_t count)
+{
+	if (!(file->f_flags & O_SYNC) && !IS_SYNC(file->f_mapping->host))
+		return 0;
+	return generic_sync_file(file, file->f_path.dentry, pos,
+				 pos + count - 1,
+				 SYNC_SUBMIT_DATA | SYNC_WAIT_DATA);
+}
+EXPORT_SYMBOL(generic_write_sync);
+
 /*
  * sys_sync_file_range() permits finely controlled syncing over a segment of
  * a file in the range offset .. (offset+nbytes-1) inclusive.  If nbytes is
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 29fc8da..648001c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2088,7 +2088,14 @@ extern int __filemap_fdatawrite_range(struct address_space *mapping,
 extern int filemap_fdatawrite_range(struct address_space *mapping,
 				loff_t start, loff_t end);
 
+/* Flags for generic_sync_file */
+#define SYNC_INODE		1
+#define SYNC_SUBMIT_DATA	2
+#define SYNC_WAIT_DATA		4
+extern int generic_sync_file(struct file *file, struct dentry *dentry,
+			   loff_t start, loff_t end, int what);
 extern int vfs_fsync(struct file *file, struct dentry *dentry, int datasync);
+extern int generic_write_sync(struct file *file, loff_t pos, loff_t count);
 extern void sync_supers(void);
 extern void emergency_sync(void);
 extern void emergency_remount(void);
diff --git a/mm/filemap.c b/mm/filemap.c
index 3955f7e..70988a1 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -39,11 +39,10 @@
 /*
  * FIXME: remove all knowledge of the buffer layer from the core VM
  */
-#include <linux/buffer_head.h> /* for generic_osync_inode */
+#include <linux/buffer_head.h> /* for try_to_free_buffers */
 
 #include <asm/mman.h>
 
-
 /*
  * Shared mappings implemented 30.11.1994. It's not fully working yet,
  * though.
@@ -2480,19 +2479,16 @@ ssize_t device_aio_write(struct kiocb *iocb, const struct iovec *iov,
 			 unsigned long nr_segs, loff_t pos)
 {
 	struct file *file = iocb->ki_filp;
-	struct address_space *mapping = file->f_mapping;
-	struct inode *inode = mapping->host;
 	ssize_t ret;
 
 	BUG_ON(iocb->ki_pos != pos);
 
 	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
 
-	if ((ret > 0 || ret == -EIOCBQUEUED) &&
-	    ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+	if (ret > 0 || ret == -EIOCBQUEUED) {
 		ssize_t err;
 
-		err = sync_page_range_nolock(inode, mapping, pos, ret);
+		err = generic_write_sync(file, pos, ret);
 		if (err < 0 && ret > 0)
 			ret = err;
 	}
@@ -2515,8 +2511,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 		unsigned long nr_segs, loff_t pos)
 {
 	struct file *file = iocb->ki_filp;
-	struct address_space *mapping = file->f_mapping;
-	struct inode *inode = mapping->host;
+	struct inode *inode = file->f_mapping->host;
 	ssize_t ret;
 
 	BUG_ON(iocb->ki_pos != pos);
@@ -2525,11 +2520,10 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
 	mutex_unlock(&inode->i_mutex);
 
-	if ((ret > 0 || ret == -EIOCBQUEUED) &&
-	    ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+	if (ret > 0 || ret == -EIOCBQUEUED) {
 		ssize_t err;
 
-		err = sync_page_range(inode, mapping, pos, ret);
+		err = generic_write_sync(file, pos, ret);
 		if (err < 0 && ret > 0)
 			ret = err;
 	}
-- 
1.6.0.2

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [PATCH 07/17] vfs: Introduce new helpers for syncing after writing to O_SYNC file or IS_SYNC inode
  2009-08-20 16:27       ` Christoph Hellwig
  2009-08-21 15:23         ` Jan Kara
@ 2009-08-26 18:22         ` Christoph Hellwig
  2009-08-27  0:04           ` Christoph Hellwig
  1 sibling, 1 reply; 18+ messages in thread
From: Christoph Hellwig @ 2009-08-26 18:22 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christoph Hellwig, LKML, Evgeniy Polyakov, ocfs2-devel,
	Joel Becker, Felix Blyakher, xfs, Anton Altaparmakov,
	linux-ntfs-dev, OGAWA Hirofumi, linux-ext4, tytso

On Thu, Aug 20, 2009 at 12:27:29PM -0400, Christoph Hellwig wrote:
> Maybe you can help brain storming, but I still can't see any way in that
> the
> 
>   - write data
>   - write inode
>   - wait for data
> 
> actually is a benefit in terms of semantics (I agree that it could be
> faster in theory, but even that is debatable with todays seek latencies
> in disks)

Btw, another reason why our current default is actively harmful:

	barriers

With volatile write caches we do have to flush the disk write cache in
->fsync, either implicitly by a metadata operation, or explicitly if
only data changed.  Unless the filesystems waits itself for the data
to hit the disk like XFS or btrfs will be issue the cache flush
potentially before the data write has actually reached the disk cache.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 07/17] vfs: Introduce new helpers for syncing after writing to O_SYNC file or IS_SYNC inode
  2009-08-26 18:22         ` Christoph Hellwig
@ 2009-08-27  0:04           ` Christoph Hellwig
  0 siblings, 0 replies; 18+ messages in thread
From: Christoph Hellwig @ 2009-08-27  0:04 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christoph Hellwig, LKML, Evgeniy Polyakov, ocfs2-devel,
	Joel Becker, Felix Blyakher, xfs, Anton Altaparmakov,
	linux-ntfs-dev, OGAWA Hirofumi, linux-ext4, tytso

On Wed, Aug 26, 2009 at 02:22:36PM -0400, Christoph Hellwig wrote:
> On Thu, Aug 20, 2009 at 12:27:29PM -0400, Christoph Hellwig wrote:
> > Maybe you can help brain storming, but I still can't see any way in that
> > the
> > 
> >   - write data
> >   - write inode
> >   - wait for data
> > 
> > actually is a benefit in terms of semantics (I agree that it could be
> > faster in theory, but even that is debatable with todays seek latencies
> > in disks)
> 
> Btw, another reason why our current default is actively harmful:
> 
> 	barriers
> 
> With volatile write caches we do have to flush the disk write cache in
> ->fsync, either implicitly by a metadata operation, or explicitly if
> only data changed.  Unless the filesystems waits itself for the data
> to hit the disk like XFS or btrfs will be issue the cache flush
> potentially before the data write has actually reached the disk cache.

Ok, this one failed the reality check - no matter how hard I tried
I could not reproduce that case in my test harness.  It turns out
cache flush request are quite sensibly treated as barriers by the block
layer and thus we drain the queue before issueing the cache flush.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 07/17] vfs: Introduce new helpers for syncing after writing to O_SYNC file or IS_SYNC inode
  2009-08-21 17:23 ` Jan Kara
@ 2009-08-27 17:35   ` Christoph Hellwig
  2009-08-30 16:35     ` Jamie Lokier
  0 siblings, 1 reply; 18+ messages in thread
From: Christoph Hellwig @ 2009-08-27 17:35 UTC (permalink / raw)
  To: Jan Kara
  Cc: LKML, hch, linux-fsdevel, Evgeniy Polyakov, ocfs2-devel,
	Joel Becker, Felix Blyakher, xfs, Anton Altaparmakov,
	linux-ntfs-dev, OGAWA Hirofumi, linux-ext4, tytso

> +int generic_write_sync(struct file *file, loff_t pos, loff_t count)
> +{
> +	if (!(file->f_flags & O_SYNC) && !IS_SYNC(file->f_mapping->host))
> +		return 0;
> +	return generic_sync_file(file, file->f_path.dentry, pos,
> +				 pos + count - 1,
> +				 SYNC_SUBMIT_DATA | SYNC_WAIT_DATA);
> +}
> +EXPORT_SYMBOL(generic_write_sync);

>  
> +/* Flags for generic_sync_file */
> +#define SYNC_INODE		1
> +#define SYNC_SUBMIT_DATA	2
> +#define SYNC_WAIT_DATA		4

When I think about this more I really hate the latter two flags.
There's really no reason to just do only either the submit or wait.
I'd say kill the flags for now and just implement generic_write_sync
as:

int generic_write_sync(struct file *file, loff_t pos, loff_t count)
{
	if (!(file->f_flags & O_SYNC) && !IS_SYNC(file->f_mapping->host))
		return 0;
	return vfs_fsync_range(file, file->f_path.dentry, pos,
			       pos + count - 1, 1);
}

and we can look into replacing the datasync flag with something more
meaningfull later through the whole fsync stack.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 07/17] vfs: Introduce new helpers for syncing after writing to O_SYNC file or IS_SYNC inode
  2009-08-27 17:35   ` Christoph Hellwig
@ 2009-08-30 16:35     ` Jamie Lokier
  2009-08-30 16:39       ` Christoph Hellwig
  0 siblings, 1 reply; 18+ messages in thread
From: Jamie Lokier @ 2009-08-30 16:35 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jan Kara, LKML, hch, linux-fsdevel, Evgeniy Polyakov, ocfs2-devel,
	Joel Becker, Felix Blyakher, xfs, Anton Altaparmakov,
	linux-ntfs-dev, OGAWA Hirofumi, linux-ext4, tytso

Christoph Hellwig wrote:
> int generic_write_sync(struct file *file, loff_t pos, loff_t count)
> {
> 	if (!(file->f_flags & O_SYNC) && !IS_SYNC(file->f_mapping->host))
> 		return 0;
> 	return vfs_fsync_range(file, file->f_path.dentry, pos,
> 			       pos + count - 1, 1);
> }
> 
> and we can look into replacing the datasync flag with something more
> meaningfull later through the whole fsync stack.

I like that.  It looks really clear and self-documenting, if
vfs_fsync_range does what it sounds like, which is a nice change.

If I've guessed right what that code does, proper O_RSYNC will be easy:

int generic_sync_before_read(struct file *file, loff_t pos, loff_t count)
{
	int is_sync = ((file->f_flags & O_SYNC)
		       || IS_SYNC(file->f_mapping->host));
	int is_dsync = is_sync || (file->f_flags & O_DSYNC);

	if (!is_dsync || !(file->f_flags & O_RSYNC))
		return 0;
	return vfs_fsync_range(file, file->f_ath.denty, pos,
			       pos + count - 1, is_sync);
}

(By the way, did I mention Irix has range-fsync and range-fdatasync
system calls too :-) (actually fcntls))

-- Jamie

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 07/17] vfs: Introduce new helpers for syncing after writing to O_SYNC file or IS_SYNC inode
  2009-08-30 16:35     ` Jamie Lokier
@ 2009-08-30 16:39       ` Christoph Hellwig
  2009-08-30 17:29         ` Jamie Lokier
  0 siblings, 1 reply; 18+ messages in thread
From: Christoph Hellwig @ 2009-08-30 16:39 UTC (permalink / raw)
  To: Jamie Lokier
  Cc: Christoph Hellwig, Jan Kara, LKML, hch, linux-fsdevel,
	Evgeniy Polyakov, ocfs2-devel, Joel Becker, Felix Blyakher, xfs,
	Anton Altaparmakov, linux-ntfs-dev, OGAWA Hirofumi, linux-ext4,
	tytso

On Sun, Aug 30, 2009 at 05:35:51PM +0100, Jamie Lokier wrote:
> I like that.  It looks really clear and self-documenting, if
> vfs_fsync_range does what it sounds like, which is a nice change.
> 
> If I've guessed right what that code does, proper O_RSYNC will be easy:
> 
> int generic_sync_before_read(struct file *file, loff_t pos, loff_t count)
> {
> 	int is_sync = ((file->f_flags & O_SYNC)
> 		       || IS_SYNC(file->f_mapping->host));
> 	int is_dsync = is_sync || (file->f_flags & O_DSYNC);
> 
> 	if (!is_dsync || !(file->f_flags & O_RSYNC))
> 		return 0;
> 	return vfs_fsync_range(file, file->f_ath.denty, pos,
> 			       pos + count - 1, is_sync);
> }

Yes. something like this.

> (By the way, did I mention Irix has range-fsync and range-fdatasync
> system calls too :-) (actually fcntls))

Linux has sync_file_range which currently is a perfect way to lose your
synced' data, but with two more flags and calls to ->fsync we could
turn it into range-fsync/fdatasync.  I'm not sure if that's a good
idea or if we should just add a sys_fdatasync_rage systems call.

I don't quite see the point of a range-fsync, but it could be easily
implemented as a flag.


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 07/17] vfs: Introduce new helpers for syncing after writing to O_SYNC file or IS_SYNC inode
  2009-08-30 16:39       ` Christoph Hellwig
@ 2009-08-30 17:29         ` Jamie Lokier
  0 siblings, 0 replies; 18+ messages in thread
From: Jamie Lokier @ 2009-08-30 17:29 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Christoph Hellwig, Jan Kara, LKML, linux-fsdevel,
	Evgeniy Polyakov, ocfs2-devel, Joel Becker, Felix Blyakher, xfs,
	Anton Altaparmakov, linux-ntfs-dev, OGAWA Hirofumi, linux-ext4,
	tytso

Christoph Hellwig wrote:
> Linux has sync_file_range which currently is a perfect way to lose your
> synced' data, but with two more flags and calls to ->fsync we could
> turn it into range-fsync/fdatasync.

Apart from the way it loses your data, the man page for
sync_file_range never manages to explain quite why you should use the
existing flags in various combinations.  It's only obvious if you've
worked on a kernel yourself.

Having asked this before, it appears one of the reasons for
sync_file_range working as it does is to give the application more
control over writeback order and to some extent, reduce the amount of
blocking.

But it's really difficult to manage the amount of blocking with it.
You need to know the request queue size among other things, and even
if you do it's dynamic.  Writeback order would be as easy with
fdatasync_range, and if you want to reduce blocking, a good
implementation of aio_fsync would be more useful.  Or, you have to use
application writeback threads anyway, so fdatasync_range again.

The one thing sync_file_range can do is let you submit multiple ranges
which the elevators can sort for the hardware.  You can't do that with
sequential calls to fdatasync_range, and it's not clear that aio_fsync
is implemented well enough (but it's a fairly good API for it).

Nick Piggin's idea to let fdatasync_range take multiple ranges might
help with that, but it's not clear how much.

> I'm not sure if that's a good
> idea or if we should just add a sys_fdatasync_rage systems call.

fdatasync_range has the advantage of being comprehensible.  People
will use it because it makes sense.

sync_file_range could be hijacked with new flags to implement
fdatasync_range.  If that's done, I'd rename the system call, but keep
it compatible with sync_file_range's flags, which would never be set
when userspace uses the new functionality.

> I don't quite see the point of a range-fsync, but it could be easily
> implemented as a flag.

A flags argument would be good anyway: to indicate if we want an
ordinary fdatasync, or something which flushes the relevant bit of
volatile hardware caches too.  With that as a capability, it is useful
to offer fsync, because that'd be the only way to get a volatile
hardware cache flush (or maybe the only way not to?).

For that reason, it should be permitted to give an infinitely large range.

I don't see the point of range-fsync either, but I'm not sure if I see
any harm in it.  If permitted, range-fsync with a zero-byte range
would flush just the inode state and none of the data.  If that's
technically available, maybe O_ISYNC and "#define O_SYNC
(O_DATASYNC|O_ISYNC)" isn't such as daft idea.

I'd call it fsync_range for consistency with aio_fsync (POSIX), which
takes flags O_DSYNC or O_SYNC to indicate the type of sync.  But I'd
use new flag names, to keep the space clear for other flags.  Just
sketching some ideas:

/* One of FSYNC_RANGE_SYNC or FSYNC_RANGE_DATASYNC must be set. */

#define FSYNC_RANGE_SYNC	(1 << 0)	/* Like fsync, O_SYNC. */
#define FSYNC_RANGE_DATASYNC	(1 << 1)	/* Like fdatasync, O_DSYNC. */
#define FSYNC_RANGE_NO_HWCACHE	(1 << 2)	/* Not hardware caches. */

-- Jamie

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2009-08-30 17:29 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
     [not found] <1250697884-22288-1-git-send-email-jack@suse.cz>
2009-08-19 16:04 ` [PATCH 07/17] vfs: Introduce new helpers for syncing after writing to O_SYNC file or IS_SYNC inode Jan Kara
2009-08-19 16:26   ` Christoph Hellwig
2009-08-20 12:15     ` Jan Kara
2009-08-20 16:27       ` Christoph Hellwig
2009-08-21 15:23         ` Jan Kara
2009-08-21 15:32           ` Christoph Hellwig
2009-08-21 15:48             ` Jan Kara
2009-08-26 18:22         ` Christoph Hellwig
2009-08-27  0:04           ` Christoph Hellwig
2009-08-19 16:04 ` [PATCH 08/17] ext2: Update comment about generic_osync_inode Jan Kara
2009-08-19 16:04 ` [PATCH 09/17] ext3: Remove syncing logic from ext3_file_write Jan Kara
2009-08-19 16:04 ` [PATCH 10/17] ext4: Remove syncing logic from ext4_file_write Jan Kara
     [not found] <1250874001-15483-1-git-send-email-jack@suse.cz>
2009-08-21 16:59 ` [PATCH 07/17] vfs: Introduce new helpers for syncing after writing to O_SYNC file or IS_SYNC inode Jan Kara
     [not found] <1250875447-15622-1-git-send-email-jack@suse.cz>
2009-08-21 17:23 ` Jan Kara
2009-08-27 17:35   ` Christoph Hellwig
2009-08-30 16:35     ` Jamie Lokier
2009-08-30 16:39       ` Christoph Hellwig
2009-08-30 17:29         ` Jamie Lokier

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).