[PATCH 07/17] vfs: Introduce new helpers for syncing after writing to O_SYNC file or IS

linux-ext4.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH 07/17] vfs: Introduce new helpers for syncing after writing to O_SYNC file or IS_SYNC inode
       [not found] <1250697884-22288-1-git-send-email-jack@suse.cz>
@ 2009-08-19 16:04 ` Jan Kara
  2009-08-19 16:26   ` Christoph Hellwig
  2009-08-19 16:04 ` [PATCH 08/17] ext2: Update comment about generic_osync_inode Jan Kara
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 14+ messages in thread
From: Jan Kara @ 2009-08-19 16:04 UTC (permalink / raw)
  To: LKML
  Cc: hch, Jan Kara, Evgeniy Polyakov, ocfs2-devel, Joel Becker,
	Felix Blyakher, xfs, Anton Altaparmakov, linux-ntfs-dev,
	OGAWA Hirofumi, linux-ext4, tytso

Introduce new function for generic inode syncing (generic_sync_file) and use
it from fsync() path. Introduce also new helper for syncing after a sync
write (generic_write_sync) using the generic function.

Use these new helpers for syncing from generic VFS functions.

CC: Evgeniy Polyakov <zbr@ioremap.net>
CC: ocfs2-devel@oss.oracle.com
CC: Joel Becker <joel.becker@oracle.com>
CC: Felix Blyakher <felixb@sgi.com>
CC: xfs@oss.sgi.com
CC: Anton Altaparmakov <aia21@cantab.net>
CC: linux-ntfs-dev@lists.sourceforge.net
CC: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
CC: linux-ext4@vger.kernel.org
CC: tytso@mit.edu
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/splice.c        |   22 ++++----------
 fs/sync.c          |   81 +++++++++++++++++++++++++++++++++++++++++++---------
 include/linux/fs.h |    7 ++++
 mm/filemap.c       |   16 ++++------
 4 files changed, 86 insertions(+), 40 deletions(-)

diff --git a/fs/splice.c b/fs/splice.c
index 73766d2..8190237 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -976,25 +976,15 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 
 	if (ret > 0) {
 		unsigned long nr_pages;
+		int err;
 
-		*ppos += ret;
 		nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 
-		/*
-		 * If file or inode is SYNC and we actually wrote some data,
-		 * sync it.
-		 */
-		if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
-			int err;
-
-			mutex_lock(&inode->i_mutex);
-			err = generic_osync_inode(inode, mapping,
-						  OSYNC_METADATA|OSYNC_DATA);
-			mutex_unlock(&inode->i_mutex);
-
-			if (err)
-				ret = err;
-		}
+		err = generic_write_sync(out, *ppos, ret);
+		if (err)
+			ret = err;
+		else
+			*ppos += ret;
 		balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
 	}
 
diff --git a/fs/sync.c b/fs/sync.c
index 3422ba6..fc320aa 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -176,23 +176,30 @@ int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
 }
 
 /**
- * vfs_fsync - perform a fsync or fdatasync on a file
+ * generic_sync_file - helper to sync data & metadata to disk
  * @file:		file to sync
  * @dentry:		dentry of @file
- * @data:		only perform a fdatasync operation
+ * @start:		offset in bytes of the beginning of data range to sync
+ * @end:		offset in bytes of the end of data range (inclusive)
+ * @what:		what should be synced
  *
- * Write back data and metadata for @file to disk.  If @datasync is
- * set only metadata needed to access modified file data is written.
+ * What the function exactly does is controlled by the @what parameter:
  *
- * In case this function is called from nfsd @file may be %NULL and
- * only @dentry is set.  This can only happen when the filesystem
- * implements the export_operations API.
+ * If SYNC_SUBMIT_DATA is set, the function submits all pages in the given
+ * range to disk.
+ *
+ * The function always calls ->fsync() callback of the filesystem. If
+ * SYNC_INODE is not set, we pass down the fact that it is just a datasync.
+ *
+ * If SYNC_WAIT_DATA is set, the function waits for writeback to finish
+ * in the given range.
  */
-int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+int generic_sync_file(struct file *file, struct dentry *dentry, loff_t start,
+		      loff_t end, int what)
 {
 	const struct file_operations *fop;
 	struct address_space *mapping;
-	int err, ret;
+	int err, ret = 0;
 
 	/*
 	 * Get mapping and operations from the file in case we have
@@ -212,23 +219,50 @@ int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
 		goto out;
 	}
 
-	ret = filemap_fdatawrite(mapping);
+	if (what & SYNC_SUBMIT_DATA)
+		ret = filemap_fdatawrite_range(mapping, start, end);
 
 	/*
 	 * We need to protect against concurrent writers, which could cause
 	 * livelocks in fsync_buffers_list().
 	 */
 	mutex_lock(&mapping->host->i_mutex);
-	err = fop->fsync(file, dentry, datasync);
+	err = fop->fsync(file, dentry, !(what & SYNC_INODE));
 	if (!ret)
 		ret = err;
 	mutex_unlock(&mapping->host->i_mutex);
-	err = filemap_fdatawait(mapping);
-	if (!ret)
-		ret = err;
+
+	if (what & SYNC_WAIT_DATA) {
+		err = filemap_fdatawait_range(mapping, start, end);
+		if (!ret)
+			ret = err;
+	}
 out:
 	return ret;
 }
+EXPORT_SYMBOL(generic_sync_file);
+
+/**
+ * vfs_fsync - perform a fsync or fdatasync on a file
+ * @file:		file to sync
+ * @dentry:		dentry of @file
+ * @datasync:		only perform a fdatasync operation
+ *
+ * Write back data and metadata for @file to disk.  If @datasync is
+ * set only metadata needed to access modified file data is written.
+ *
+ * In case this function is called from nfsd @file may be %NULL and
+ * only @dentry is set.  This can only happen when the filesystem
+ * implements the export_operations API.
+ */
+int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+{
+	int what = SYNC_SUBMIT_DATA | SYNC_WAIT_DATA;
+
+	if (!datasync)
+		what |= SYNC_INODE;
+	return generic_sync_file(file, dentry, 0, LLONG_MAX, what);
+}
 EXPORT_SYMBOL(vfs_fsync);
 
 static int do_fsync(unsigned int fd, int datasync)
@@ -254,6 +288,25 @@ SYSCALL_DEFINE1(fdatasync, unsigned int, fd)
 	return do_fsync(fd, 1);
 }
 
+/**
+ * generic_write_sync - perform syncing after a write if file / inode is sync
+ * @file:	file to which the write happened
+ * @pos:	offset where the write started
+ * @count:	length of the write
+ *
+ * This is just a simple wrapper about our general syncing function.
+ * FIXME: Make it inline?
+ */
+int generic_write_sync(struct file *file, loff_t pos, loff_t count)
+{
+	if (!(file->f_flags & O_SYNC) && !IS_SYNC(file->f_mapping->host))
+		return 0;
+	return generic_sync_file(file, file->f_path.dentry, pos,
+				 pos + count - 1,
+				 SYNC_SUBMIT_DATA | SYNC_WAIT_DATA);
+}
+EXPORT_SYMBOL(generic_write_sync);
+
 /*
  * sys_sync_file_range() permits finely controlled syncing over a segment of
  * a file in the range offset .. (offset+nbytes-1) inclusive.  If nbytes is
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 1edd159..0f13049 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2088,7 +2088,14 @@ extern int __filemap_fdatawrite_range(struct address_space *mapping,
 extern int filemap_fdatawrite_range(struct address_space *mapping,
 				loff_t start, loff_t end);
 
+/* Flags for generic_sync_file */
+#define SYNC_INODE		1
+#define SYNC_SUBMIT_DATA	2
+#define SYNC_WAIT_DATA		4
+extern int generic_sync_file(struct file *file, struct dentry *dentry,
+			   loff_t start, loff_t end, int what);
 extern int vfs_fsync(struct file *file, struct dentry *dentry, int datasync);
+extern int generic_write_sync(struct file *file, loff_t pos, loff_t count);
 extern void sync_supers(void);
 extern void emergency_sync(void);
 extern void emergency_remount(void);
diff --git a/mm/filemap.c b/mm/filemap.c
index dffdc60..a29b35b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -39,11 +39,10 @@
 /*
  * FIXME: remove all knowledge of the buffer layer from the core VM
  */
-#include <linux/buffer_head.h> /* for generic_osync_inode */
+#include <linux/buffer_head.h> /* for try_to_free_buffers */
 
 #include <asm/mman.h>
 
-
 /*
  * Shared mappings implemented 30.11.1994. It's not fully working yet,
  * though.
@@ -2452,18 +2451,16 @@ ssize_t generic_file_aio_write_nolock(struct kiocb *iocb,
 		const struct iovec *iov, unsigned long nr_segs, loff_t pos)
 {
 	struct file *file = iocb->ki_filp;
-	struct address_space *mapping = file->f_mapping;
-	struct inode *inode = mapping->host;
 	ssize_t ret;
 
 	BUG_ON(iocb->ki_pos != pos);
 
 	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
 
-	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+	if (ret > 0) {
 		ssize_t err;
 
-		err = sync_page_range(inode, mapping, pos, ret);
+		err = generic_write_sync(file, pos, ret);
 		if (err < 0)
 			ret = err;
 	}
@@ -2486,8 +2483,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 		unsigned long nr_segs, loff_t pos)
 {
 	struct file *file = iocb->ki_filp;
-	struct address_space *mapping = file->f_mapping;
-	struct inode *inode = mapping->host;
+	struct inode *inode = file->f_mapping->host;
 	ssize_t ret;
 
 	BUG_ON(iocb->ki_pos != pos);
@@ -2496,10 +2492,10 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
 	mutex_unlock(&inode->i_mutex);
 
-	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+	if (ret > 0) {
 		ssize_t err;
 
-		err = sync_page_range(inode, mapping, pos, ret);
+		err = generic_write_sync(file, pos, ret);
 		if (err < 0)
 			ret = err;
 	}
-- 
1.6.0.2

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH 07/17] vfs: Introduce new helpers for syncing after writing to O_SYNC file or IS_SYNC inode
  2009-08-19 16:04 ` [PATCH 07/17] vfs: Introduce new helpers for syncing after writing to O_SYNC file or IS_SYNC inode Jan Kara
@ 2009-08-19 16:26   ` Christoph Hellwig
  2009-08-20 12:15     ` Jan Kara
  0 siblings, 1 reply; 14+ messages in thread
From: Christoph Hellwig @ 2009-08-19 16:26 UTC (permalink / raw)
  To: Jan Kara
  Cc: LKML, hch, Evgeniy Polyakov, ocfs2-devel, Joel Becker,
	Felix Blyakher, xfs, Anton Altaparmakov, linux-ntfs-dev,
	OGAWA Hirofumi, linux-ext4, tytso

Looks good to me.  Eventually we should use those SYNC_ flags also all
through the fsync codepath, but I'll see if I can incorporate that in my
planned fsync rewrite.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 07/17] vfs: Introduce new helpers for syncing after writing to O_SYNC file or IS_SYNC inode
  2009-08-19 16:26   ` Christoph Hellwig
@ 2009-08-20 12:15     ` Jan Kara
  2009-08-20 16:27       ` Christoph Hellwig
  0 siblings, 1 reply; 14+ messages in thread
From: Jan Kara @ 2009-08-20 12:15 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jan Kara, LKML, Evgeniy Polyakov, ocfs2-devel, Joel Becker,
	Felix Blyakher, xfs, Anton Altaparmakov, linux-ntfs-dev,
	OGAWA Hirofumi, linux-ext4, tytso

On Wed 19-08-09 12:26:38, Christoph Hellwig wrote:
> Looks good to me.  Eventually we should use those SYNC_ flags also all
> through the fsync codepath, but I'll see if I can incorporate that in my
> planned fsync rewrite.
  Yes, I thought I'll leave that for later. BTW it should be fairly easy to
teach generic_sync_file() to do fdatawait() before calling ->fsync() if the
filesystem sets some flag in inode->i_mapping (or somewhere else) as is
needed for XFS, btrfs, etc.

								Honza
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 07/17] vfs: Introduce new helpers for syncing after writing to O_SYNC file or IS_SYNC inode
  2009-08-20 12:15     ` Jan Kara
@ 2009-08-20 16:27       ` Christoph Hellwig
  2009-08-21 15:23         ` Jan Kara
  2009-08-26 18:22         ` Christoph Hellwig
  0 siblings, 2 replies; 14+ messages in thread
From: Christoph Hellwig @ 2009-08-20 16:27 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christoph Hellwig, LKML, Evgeniy Polyakov, ocfs2-devel,
	Joel Becker, Felix Blyakher, xfs, Anton Altaparmakov,
	linux-ntfs-dev, OGAWA Hirofumi, linux-ext4, tytso

On Thu, Aug 20, 2009 at 02:15:31PM +0200, Jan Kara wrote:
> On Wed 19-08-09 12:26:38, Christoph Hellwig wrote:
> > Looks good to me.  Eventually we should use those SYNC_ flags also all
> > through the fsync codepath, but I'll see if I can incorporate that in my
> > planned fsync rewrite.
>   Yes, I thought I'll leave that for later. BTW it should be fairly easy to
> teach generic_sync_file() to do fdatawait() before calling ->fsync() if the
> filesystem sets some flag in inode->i_mapping (or somewhere else) as is
> needed for XFS, btrfs, etc.

Maybe you can help brain storming, but I still can't see any way in that
the

  - write data
  - write inode
  - wait for data

actually is a benefit in terms of semantics (I agree that it could be
faster in theory, but even that is debatable with todays seek latencies
in disks)

Think about a simple non-journaling filesystem like ext2:

 (1) block get allocated during ->write before putting data in
      - this dirties the inode because we update i_block/i_size/etc
 (2) we call fsync (or the O_SNC handling code for that matter)
      - we start writeout of the data, which takes forever because the
	file is very large
      - then we write out the inode, including the i_size/i_blocks
	update
      - due to some reason this gets reordered before the data writeout
	finishes (without that happening there would be no benefit to
	this ordering anyway)
 (3) no we call filemap_fdatawait to wait for data I/O to finish


Now the system crashes between (2) and (3).  After that we we do have
stale data in the inode in the area not written yet.

Is there some case between that simple filesystem and the i_size update
from I/O completion handler in XFS/ext4 where this behaviour actually
buys us anything?  Any ext3 magic maybe?

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 07/17] vfs: Introduce new helpers for syncing after writing to O_SYNC file or IS_SYNC inode
  2009-08-20 16:27       ` Christoph Hellwig
@ 2009-08-21 15:23         ` Jan Kara
  2009-08-21 15:32           ` Christoph Hellwig
  2009-08-26 18:22         ` Christoph Hellwig
  1 sibling, 1 reply; 14+ messages in thread
From: Jan Kara @ 2009-08-21 15:23 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jan Kara, LKML, Evgeniy Polyakov, ocfs2-devel, Joel Becker,
	Felix Blyakher, xfs, Anton Altaparmakov, linux-ntfs-dev,
	OGAWA Hirofumi, linux-ext4, tytso

On Thu 20-08-09 12:27:29, Christoph Hellwig wrote:
> On Thu, Aug 20, 2009 at 02:15:31PM +0200, Jan Kara wrote:
> > On Wed 19-08-09 12:26:38, Christoph Hellwig wrote:
> > > Looks good to me.  Eventually we should use those SYNC_ flags also all
> > > through the fsync codepath, but I'll see if I can incorporate that in my
> > > planned fsync rewrite.
> >   Yes, I thought I'll leave that for later. BTW it should be fairly easy to
> > teach generic_sync_file() to do fdatawait() before calling ->fsync() if the
> > filesystem sets some flag in inode->i_mapping (or somewhere else) as is
> > needed for XFS, btrfs, etc.
> 
> Maybe you can help brain storming, but I still can't see any way in that
> the
> 
>   - write data
>   - write inode
>   - wait for data
> 
> actually is a benefit in terms of semantics (I agree that it could be
> faster in theory, but even that is debatable with todays seek latencies
> in disks)
> 
> Think about a simple non-journaling filesystem like ext2:
> 
>  (1) block get allocated during ->write before putting data in
>       - this dirties the inode because we update i_block/i_size/etc
>  (2) we call fsync (or the O_SNC handling code for that matter)
>       - we start writeout of the data, which takes forever because the
> 	file is very large
>       - then we write out the inode, including the i_size/i_blocks
> 	update
>       - due to some reason this gets reordered before the data writeout
> 	finishes (without that happening there would be no benefit to
> 	this ordering anyway)
>  (3) no we call filemap_fdatawait to wait for data I/O to finish
> 
> Now the system crashes between (2) and (3).  After that we we do have
> stale data in the inode in the area not written yet.
  Yes, that's true.

> Is there some case between that simple filesystem and the i_size update
> from I/O completion handler in XFS/ext4 where this behaviour actually
> buys us anything?  Any ext3 magic maybe?
  Hmm, I can imagine it would buy us something in two cases (but looking at
the code, neither is implemented in such a way that it would really help
us in any way):
1) when an inode and it's data are stored in one block (e.g. OCFS2 or UDF) do
   this.
2) when we journal data
  In the first case we would wait for block with data to be written only to
submit it again because inode was still dirty.
  In the second case, it would make sence if we waited for transaction
commit in fdatawait() because only then data is really on disk. But I don't
know about a fs which would do it - ext3 in data=journal mode just adds
page buffers to the current transaction in writepage and never sets
PageWriteback so fdatawait() is nop for it. The page is pinned in memory
only by the fact that its buffer heads are part of a transaction and thus
cannot be freed.
  So currently I don't know about real cases where fdatawait after ->fsync()
would buy us anything...

									Honza
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 07/17] vfs: Introduce new helpers for syncing after writing to O_SYNC file or IS_SYNC inode
  2009-08-21 15:23         ` Jan Kara
@ 2009-08-21 15:32           ` Christoph Hellwig
  2009-08-21 15:48             ` Jan Kara
  0 siblings, 1 reply; 14+ messages in thread
From: Christoph Hellwig @ 2009-08-21 15:32 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christoph Hellwig, LKML, Evgeniy Polyakov, ocfs2-devel,
	Joel Becker, Felix Blyakher, xfs, Anton Altaparmakov,
	linux-ntfs-dev, OGAWA Hirofumi, linux-ext4, tytso

On Fri, Aug 21, 2009 at 05:23:39PM +0200, Jan Kara wrote:
>   Hmm, I can imagine it would buy us something in two cases (but looking at
> the code, neither is implemented in such a way that it would really help
> us in any way):
> 1) when an inode and it's data are stored in one block (e.g. OCFS2 or UDF) do
>    this.

>   In the first case we would wait for block with data to be written only to
> submit it again because inode was still dirty.

But we would not actually store in the pagecache or at least never
writeback it via the pacache in that case, but always just write it as
part of the inode.


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 07/17] vfs: Introduce new helpers for syncing after writing to O_SYNC file or IS_SYNC inode
  2009-08-21 15:32           ` Christoph Hellwig
@ 2009-08-21 15:48             ` Jan Kara
  0 siblings, 0 replies; 14+ messages in thread
From: Jan Kara @ 2009-08-21 15:48 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jan Kara, LKML, Evgeniy Polyakov, ocfs2-devel, Joel Becker,
	Felix Blyakher, xfs, Anton Altaparmakov, linux-ntfs-dev,
	OGAWA Hirofumi, linux-ext4, tytso

On Fri 21-08-09 11:32:35, Christoph Hellwig wrote:
> On Fri, Aug 21, 2009 at 05:23:39PM +0200, Jan Kara wrote:
> >   Hmm, I can imagine it would buy us something in two cases (but looking at
> > the code, neither is implemented in such a way that it would really help
> > us in any way):
> > 1) when an inode and it's data are stored in one block (e.g. OCFS2 or UDF) do
> >    this.
> 
> >   In the first case we would wait for block with data to be written only to
> > submit it again because inode was still dirty.
> 
> But we would not actually store in the pagecache or at least never
> writeback it via the pacache in that case, but always just write it as
> part of the inode.
  Yes, probably. Actually, that's what UDF does. It's just that then the
sequence fdatawrite(), fdatawait() does not really send the data to disk.
But that probably does not matter.

									Honza
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 07/17] vfs: Introduce new helpers for syncing after writing to O_SYNC file or IS_SYNC inode
  2009-08-20 16:27       ` Christoph Hellwig
  2009-08-21 15:23         ` Jan Kara
@ 2009-08-26 18:22         ` Christoph Hellwig
  2009-08-27  0:04           ` Christoph Hellwig
  1 sibling, 1 reply; 14+ messages in thread
From: Christoph Hellwig @ 2009-08-26 18:22 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christoph Hellwig, LKML, Evgeniy Polyakov, ocfs2-devel,
	Joel Becker, Felix Blyakher, xfs, Anton Altaparmakov,
	linux-ntfs-dev, OGAWA Hirofumi, linux-ext4, tytso

On Thu, Aug 20, 2009 at 12:27:29PM -0400, Christoph Hellwig wrote:
> Maybe you can help brain storming, but I still can't see any way in that
> the
> 
>   - write data
>   - write inode
>   - wait for data
> 
> actually is a benefit in terms of semantics (I agree that it could be
> faster in theory, but even that is debatable with todays seek latencies
> in disks)

Btw, another reason why our current default is actively harmful:

	barriers

With volatile write caches we do have to flush the disk write cache in
->fsync, either implicitly by a metadata operation, or explicitly if
only data changed.  Unless the filesystems waits itself for the data
to hit the disk like XFS or btrfs will be issue the cache flush
potentially before the data write has actually reached the disk cache.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 07/17] vfs: Introduce new helpers for syncing after writing to O_SYNC file or IS_SYNC inode
  2009-08-26 18:22         ` Christoph Hellwig
@ 2009-08-27  0:04           ` Christoph Hellwig
  0 siblings, 0 replies; 14+ messages in thread
From: Christoph Hellwig @ 2009-08-27  0:04 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christoph Hellwig, LKML, Evgeniy Polyakov, ocfs2-devel,
	Joel Becker, Felix Blyakher, xfs, Anton Altaparmakov,
	linux-ntfs-dev, OGAWA Hirofumi, linux-ext4, tytso

On Wed, Aug 26, 2009 at 02:22:36PM -0400, Christoph Hellwig wrote:
> On Thu, Aug 20, 2009 at 12:27:29PM -0400, Christoph Hellwig wrote:
> > Maybe you can help brain storming, but I still can't see any way in that
> > the
> > 
> >   - write data
> >   - write inode
> >   - wait for data
> > 
> > actually is a benefit in terms of semantics (I agree that it could be
> > faster in theory, but even that is debatable with todays seek latencies
> > in disks)
> 
> Btw, another reason why our current default is actively harmful:
> 
> 	barriers
> 
> With volatile write caches we do have to flush the disk write cache in
> ->fsync, either implicitly by a metadata operation, or explicitly if
> only data changed.  Unless the filesystems waits itself for the data
> to hit the disk like XFS or btrfs will be issue the cache flush
> potentially before the data write has actually reached the disk cache.

Ok, this one failed the reality check - no matter how hard I tried
I could not reproduce that case in my test harness.  It turns out
cache flush request are quite sensibly treated as barriers by the block
layer and thus we drain the queue before issueing the cache flush.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH 08/17] ext2: Update comment about generic_osync_inode
       [not found] <1250697884-22288-1-git-send-email-jack@suse.cz>
  2009-08-19 16:04 ` [PATCH 07/17] vfs: Introduce new helpers for syncing after writing to O_SYNC file or IS_SYNC inode Jan Kara
@ 2009-08-19 16:04 ` Jan Kara
  2009-08-19 16:04 ` [PATCH 09/17] ext3: Remove syncing logic from ext3_file_write Jan Kara
  2009-08-19 16:04 ` [PATCH 10/17] ext4: Remove syncing logic from ext4_file_write Jan Kara
  3 siblings, 0 replies; 14+ messages in thread
From: Jan Kara @ 2009-08-19 16:04 UTC (permalink / raw)
  To: LKML; +Cc: hch, Jan Kara, linux-ext4

We rely on generic_write_sync() now.

CC: linux-ext4@vger.kernel.org
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext2/inode.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index e271303..1c1638f 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -482,7 +482,7 @@ static int ext2_alloc_branch(struct inode *inode,
 		unlock_buffer(bh);
 		mark_buffer_dirty_inode(bh, inode);
 		/* We used to sync bh here if IS_SYNC(inode).
-		 * But we now rely upon generic_osync_inode()
+		 * But we now rely upon generic_write_sync()
 		 * and b_inode_buffers.  But not for directories.
 		 */
 		if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode))
-- 
1.6.0.2

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH 09/17] ext3: Remove syncing logic from ext3_file_write
       [not found] <1250697884-22288-1-git-send-email-jack@suse.cz>
  2009-08-19 16:04 ` [PATCH 07/17] vfs: Introduce new helpers for syncing after writing to O_SYNC file or IS_SYNC inode Jan Kara
  2009-08-19 16:04 ` [PATCH 08/17] ext2: Update comment about generic_osync_inode Jan Kara
@ 2009-08-19 16:04 ` Jan Kara
  2009-08-19 16:04 ` [PATCH 10/17] ext4: Remove syncing logic from ext4_file_write Jan Kara
  3 siblings, 0 replies; 14+ messages in thread
From: Jan Kara @ 2009-08-19 16:04 UTC (permalink / raw)
  To: LKML; +Cc: hch, Jan Kara, linux-ext4

Syncing is now properly done by generic_file_aio_write() so no special logic is
needed in ext3.

CC: linux-ext4@vger.kernel.org
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext3/file.c |   61 +-------------------------------------------------------
 1 files changed, 1 insertions(+), 60 deletions(-)

diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index 5b49704..51fee5f 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -51,71 +51,12 @@ static int ext3_release_file (struct inode * inode, struct file * filp)
 	return 0;
 }
 
-static ssize_t
-ext3_file_write(struct kiocb *iocb, const struct iovec *iov,
-		unsigned long nr_segs, loff_t pos)
-{
-	struct file *file = iocb->ki_filp;
-	struct inode *inode = file->f_path.dentry->d_inode;
-	ssize_t ret;
-	int err;
-
-	ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
-
-	/*
-	 * Skip flushing if there was an error, or if nothing was written.
-	 */
-	if (ret <= 0)
-		return ret;
-
-	/*
-	 * If the inode is IS_SYNC, or is O_SYNC and we are doing data
-	 * journalling then we need to make sure that we force the transaction
-	 * to disk to keep all metadata uptodate synchronously.
-	 */
-	if (file->f_flags & O_SYNC) {
-		/*
-		 * If we are non-data-journaled, then the dirty data has
-		 * already been flushed to backing store by generic_osync_inode,
-		 * and the inode has been flushed too if there have been any
-		 * modifications other than mere timestamp updates.
-		 *
-		 * Open question --- do we care about flushing timestamps too
-		 * if the inode is IS_SYNC?
-		 */
-		if (!ext3_should_journal_data(inode))
-			return ret;
-
-		goto force_commit;
-	}
-
-	/*
-	 * So we know that there has been no forced data flush.  If the inode
-	 * is marked IS_SYNC, we need to force one ourselves.
-	 */
-	if (!IS_SYNC(inode))
-		return ret;
-
-	/*
-	 * Open question #2 --- should we force data to disk here too?  If we
-	 * don't, the only impact is that data=writeback filesystems won't
-	 * flush data to disk automatically on IS_SYNC, only metadata (but
-	 * historically, that is what ext2 has done.)
-	 */
-
-force_commit:
-	err = ext3_force_commit(inode->i_sb);
-	if (err)
-		return err;
-	return ret;
-}
-
 const struct file_operations ext3_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
 	.write		= do_sync_write,
 	.aio_read	= generic_file_aio_read,
-	.aio_write	= ext3_file_write,
+	.aio_write	= generic_file_aio_write,
 	.unlocked_ioctl	= ext3_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ext3_compat_ioctl,
-- 
1.6.0.2

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH 10/17] ext4: Remove syncing logic from ext4_file_write
       [not found] <1250697884-22288-1-git-send-email-jack@suse.cz>
                   ` (2 preceding siblings ...)
  2009-08-19 16:04 ` [PATCH 09/17] ext3: Remove syncing logic from ext3_file_write Jan Kara
@ 2009-08-19 16:04 ` Jan Kara
  3 siblings, 0 replies; 14+ messages in thread
From: Jan Kara @ 2009-08-19 16:04 UTC (permalink / raw)
  To: LKML; +Cc: hch, Jan Kara, linux-ext4, tytso

The syncing is now properly handled by generic_file_aio_write() so
no special ext4 code is needed.

CC: linux-ext4@vger.kernel.org
CC: tytso@mit.edu
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext4/file.c |   53 ++---------------------------------------------------
 1 files changed, 2 insertions(+), 51 deletions(-)

diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 3f1873f..aafe432 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -58,10 +58,7 @@ static ssize_t
 ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
 		unsigned long nr_segs, loff_t pos)
 {
-	struct file *file = iocb->ki_filp;
-	struct inode *inode = file->f_path.dentry->d_inode;
-	ssize_t ret;
-	int err;
+	struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
 
 	/*
 	 * If we have encountered a bitmap-format file, the size limit
@@ -81,53 +78,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
 		}
 	}
 
-	ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
-	/*
-	 * Skip flushing if there was an error, or if nothing was written.
-	 */
-	if (ret <= 0)
-		return ret;
-
-	/*
-	 * If the inode is IS_SYNC, or is O_SYNC and we are doing data
-	 * journalling then we need to make sure that we force the transaction
-	 * to disk to keep all metadata uptodate synchronously.
-	 */
-	if (file->f_flags & O_SYNC) {
-		/*
-		 * If we are non-data-journaled, then the dirty data has
-		 * already been flushed to backing store by generic_osync_inode,
-		 * and the inode has been flushed too if there have been any
-		 * modifications other than mere timestamp updates.
-		 *
-		 * Open question --- do we care about flushing timestamps too
-		 * if the inode is IS_SYNC?
-		 */
-		if (!ext4_should_journal_data(inode))
-			return ret;
-
-		goto force_commit;
-	}
-
-	/*
-	 * So we know that there has been no forced data flush.  If the inode
-	 * is marked IS_SYNC, we need to force one ourselves.
-	 */
-	if (!IS_SYNC(inode))
-		return ret;
-
-	/*
-	 * Open question #2 --- should we force data to disk here too?  If we
-	 * don't, the only impact is that data=writeback filesystems won't
-	 * flush data to disk automatically on IS_SYNC, only metadata (but
-	 * historically, that is what ext2 has done.)
-	 */

^ permalink raw reply related	[flat|nested] 14+ messages in thread

[parent not found: <1250874001-15483-1-git-send-email-jack@suse.cz>]

* [PATCH 09/17] ext3: Remove syncing logic from ext3_file_write
       [not found] <1250874001-15483-1-git-send-email-jack@suse.cz>
@ 2009-08-21 16:59 ` Jan Kara
  0 siblings, 0 replies; 14+ messages in thread
From: Jan Kara @ 2009-08-21 16:59 UTC (permalink / raw)
  To: LKML; +Cc: hch, linux-fsdevel, Jan Kara, linux-ext4

Syncing is now properly done by generic_file_aio_write() so no special logic is
needed in ext3.

CC: linux-ext4@vger.kernel.org
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext3/file.c |   61 +-------------------------------------------------------
 1 files changed, 1 insertions(+), 60 deletions(-)

diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index 5b49704..51fee5f 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -51,71 +51,12 @@ static int ext3_release_file (struct inode * inode, struct file * filp)
 	return 0;
 }
 
-static ssize_t
-ext3_file_write(struct kiocb *iocb, const struct iovec *iov,
-		unsigned long nr_segs, loff_t pos)
-{
-	struct file *file = iocb->ki_filp;
-	struct inode *inode = file->f_path.dentry->d_inode;
-	ssize_t ret;
-	int err;
-
-	ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
-
-	/*
-	 * Skip flushing if there was an error, or if nothing was written.
-	 */
-	if (ret <= 0)
-		return ret;
-
-	/*
-	 * If the inode is IS_SYNC, or is O_SYNC and we are doing data
-	 * journalling then we need to make sure that we force the transaction
-	 * to disk to keep all metadata uptodate synchronously.
-	 */
-	if (file->f_flags & O_SYNC) {
-		/*
-		 * If we are non-data-journaled, then the dirty data has
-		 * already been flushed to backing store by generic_osync_inode,
-		 * and the inode has been flushed too if there have been any
-		 * modifications other than mere timestamp updates.
-		 *
-		 * Open question --- do we care about flushing timestamps too
-		 * if the inode is IS_SYNC?
-		 */
-		if (!ext3_should_journal_data(inode))
-			return ret;
-
-		goto force_commit;
-	}
-
-	/*
-	 * So we know that there has been no forced data flush.  If the inode
-	 * is marked IS_SYNC, we need to force one ourselves.
-	 */
-	if (!IS_SYNC(inode))
-		return ret;
-
-	/*
-	 * Open question #2 --- should we force data to disk here too?  If we
-	 * don't, the only impact is that data=writeback filesystems won't
-	 * flush data to disk automatically on IS_SYNC, only metadata (but
-	 * historically, that is what ext2 has done.)
-	 */
-
-force_commit:
-	err = ext3_force_commit(inode->i_sb);
-	if (err)
-		return err;
-	return ret;
-}
-
 const struct file_operations ext3_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
 	.write		= do_sync_write,
 	.aio_read	= generic_file_aio_read,
-	.aio_write	= ext3_file_write,
+	.aio_write	= generic_file_aio_write,
 	.unlocked_ioctl	= ext3_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ext3_compat_ioctl,
-- 
1.6.0.2

^ permalink raw reply related	[flat|nested] 14+ messages in thread

[parent not found: <1250875447-15622-1-git-send-email-jack@suse.cz>]

* [PATCH 09/17] ext3: Remove syncing logic from ext3_file_write
       [not found] <1250875447-15622-1-git-send-email-jack@suse.cz>
@ 2009-08-21 17:23 ` Jan Kara
  0 siblings, 0 replies; 14+ messages in thread
From: Jan Kara @ 2009-08-21 17:23 UTC (permalink / raw)
  To: LKML; +Cc: hch, linux-fsdevel, Jan Kara, linux-ext4

Syncing is now properly done by generic_file_aio_write() so no special logic is
needed in ext3.

CC: linux-ext4@vger.kernel.org
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext3/file.c |   61 +-------------------------------------------------------
 1 files changed, 1 insertions(+), 60 deletions(-)

diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index 5b49704..51fee5f 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -51,71 +51,12 @@ static int ext3_release_file (struct inode * inode, struct file * filp)
 	return 0;
 }
 
-static ssize_t
-ext3_file_write(struct kiocb *iocb, const struct iovec *iov,
-		unsigned long nr_segs, loff_t pos)
-{
-	struct file *file = iocb->ki_filp;
-	struct inode *inode = file->f_path.dentry->d_inode;
-	ssize_t ret;
-	int err;
-
-	ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
-
-	/*
-	 * Skip flushing if there was an error, or if nothing was written.
-	 */
-	if (ret <= 0)
-		return ret;
-
-	/*
-	 * If the inode is IS_SYNC, or is O_SYNC and we are doing data
-	 * journalling then we need to make sure that we force the transaction
-	 * to disk to keep all metadata uptodate synchronously.
-	 */
-	if (file->f_flags & O_SYNC) {
-		/*
-		 * If we are non-data-journaled, then the dirty data has
-		 * already been flushed to backing store by generic_osync_inode,
-		 * and the inode has been flushed too if there have been any
-		 * modifications other than mere timestamp updates.
-		 *
-		 * Open question --- do we care about flushing timestamps too
-		 * if the inode is IS_SYNC?
-		 */
-		if (!ext3_should_journal_data(inode))
-			return ret;
-
-		goto force_commit;
-	}
-
-	/*
-	 * So we know that there has been no forced data flush.  If the inode
-	 * is marked IS_SYNC, we need to force one ourselves.
-	 */
-	if (!IS_SYNC(inode))
-		return ret;
-
-	/*
-	 * Open question #2 --- should we force data to disk here too?  If we
-	 * don't, the only impact is that data=writeback filesystems won't
-	 * flush data to disk automatically on IS_SYNC, only metadata (but
-	 * historically, that is what ext2 has done.)
-	 */
-
-force_commit:
-	err = ext3_force_commit(inode->i_sb);
-	if (err)
-		return err;
-	return ret;
-}
-
 const struct file_operations ext3_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
 	.write		= do_sync_write,
 	.aio_read	= generic_file_aio_read,
-	.aio_write	= ext3_file_write,
+	.aio_write	= generic_file_aio_write,
 	.unlocked_ioctl	= ext3_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ext3_compat_ioctl,
-- 
1.6.0.2


^ permalink raw reply related	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2009-08-27  0:04 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
     [not found] <1250697884-22288-1-git-send-email-jack@suse.cz>
2009-08-19 16:04 ` [PATCH 07/17] vfs: Introduce new helpers for syncing after writing to O_SYNC file or IS_SYNC inode Jan Kara
2009-08-19 16:26   ` Christoph Hellwig
2009-08-20 12:15     ` Jan Kara
2009-08-20 16:27       ` Christoph Hellwig
2009-08-21 15:23         ` Jan Kara
2009-08-21 15:32           ` Christoph Hellwig
2009-08-21 15:48             ` Jan Kara
2009-08-26 18:22         ` Christoph Hellwig
2009-08-27  0:04           ` Christoph Hellwig
2009-08-19 16:04 ` [PATCH 08/17] ext2: Update comment about generic_osync_inode Jan Kara
2009-08-19 16:04 ` [PATCH 09/17] ext3: Remove syncing logic from ext3_file_write Jan Kara
2009-08-19 16:04 ` [PATCH 10/17] ext4: Remove syncing logic from ext4_file_write Jan Kara
     [not found] <1250874001-15483-1-git-send-email-jack@suse.cz>
2009-08-21 16:59 ` [PATCH 09/17] ext3: Remove syncing logic from ext3_file_write Jan Kara
     [not found] <1250875447-15622-1-git-send-email-jack@suse.cz>
2009-08-21 17:23 ` Jan Kara

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).