public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* Why is fsync so much slower than O_SYNC?
@ 2004-03-17 16:40 Jeffrey Siegal
  2004-03-18  8:33 ` Andrew Morton
  0 siblings, 1 reply; 5+ messages in thread
From: Jeffrey Siegal @ 2004-03-17 16:40 UTC (permalink / raw)
  To: linux-kernel

This is on FC1 with Fedora/Red Hat's 2.4.22-1.2174.nptlsmp kernel, 
writing to an ext3 file system (journal=ordered) on a 7200rpm IDE drive.

O_SYNC:
Creating
Starting
iter = 1000, latency = 8.413535ms

O_DSYNC:
Creating
Starting
iter = 1000, latency = 8.429431ms

fsync:
Creating
Starting
iter = 1000, latency = 34.499984ms

fdatasync:
Creating
Starting
iter = 1000, latency = 35.568508ms


--

#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/time.h>
#include <assert.h>

#define ITER 1000
#define FSIZE (512*ITER)

static inline unsigned long microtime()
{
   struct timeval tv;
   int ret;
   ret = gettimeofday(&tv, 0);
   assert(!ret);

   return tv.tv_sec * 1000000UL + tv.tv_usec;
}

#define OP_OPEN 1
#define OP_SYNC 2

static int do_open(char *fn, int extraflags)
{
   return open(fn, O_CREAT | O_TRUNC | O_WRONLY | extraflags, 0644);
}

static int flushmethod_O_SYNC(int op, int iarg, char * sarg) {
   switch (op) {
   case OP_OPEN:
     return do_open(sarg, O_SYNC);
   case OP_SYNC:
     return 0;
   default:
     assert(0);
   }
}

static int flushmethod_O_DSYNC(int op, int iarg, char * sarg)
{
   switch (op) {
   case OP_OPEN:
     return do_open(sarg, O_DSYNC);
   case OP_SYNC:
     return 0;
   default:
     assert(0);
   }
}

static int flushmethod_fsync(int op, int iarg, char * sarg)
{
   switch (op) {
   case OP_OPEN:
     return do_open(sarg, 0);
   case OP_SYNC:
     return fsync(iarg);
   default:
     assert(0);
   }
}
static int flushmethod_fdatasync(int op, int iarg, char * sarg)
{
   switch (op) {
   case OP_OPEN:
     return do_open(sarg, 0);
   case OP_SYNC:
     return fdatasync(iarg);
   default:
     assert(0);
   }
}

int runtest(int (*flushmethod)(int op, int iarg, char * sarg))
{
   char *filename = "tmp.tmp";
   int fd = flushmethod(OP_OPEN, 0, filename);
   int ret;

   printf("Creating\n");
   {
     char *p = calloc(1, FSIZE);
     ret = write(fd, p, FSIZE);
     if (ret != FSIZE) {
       printf("%d\n", ret);
       assert(0);
     }
     free(p);
   }
   ret = flushmethod(OP_SYNC, fd, 0);
   assert(ret == 0);

   printf("Starting\n");
   unsigned long start = microtime();
   int iter = ITER;
   int offset = 0;
   while (iter--) {
     char ch = 'A';
     ret = pwrite(fd, &ch, 1, offset % FSIZE);
     assert(ret == 1);
     ret = flushmethod(OP_SYNC, fd, 0);
     assert(ret == 0);
     offset += 512;
   }

   close(fd);
   unlink(filename);
   printf("iter = %d, latency = %lfms\n", ITER, (microtime() - 
start)/(ITER*1000.0));
}

int main(int argc, char *argv)
{
   printf("\nREADME: Make sure you have turned off hardware write 
caching (hdparm -W0 /dev/hda for IDE)\n\n");
   printf("O_SYNC:\n");
   runtest(flushmethod_O_SYNC);
   printf("\n");
   printf("O_DSYNC:\n");
   runtest(flushmethod_O_DSYNC);
   printf("\n");
   printf("fsync:\n");
   runtest(flushmethod_fsync);
   printf("\n");
   printf("fdatasync:\n");
   runtest(flushmethod_fdatasync);
}


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: Why is fsync so much slower than O_SYNC?
  2004-03-17 16:40 Why is fsync so much slower than O_SYNC? Jeffrey Siegal
@ 2004-03-18  8:33 ` Andrew Morton
  2004-03-18 15:42   ` Jeffrey Siegal
  2004-03-18 16:36   ` Jeffrey Siegal
  0 siblings, 2 replies; 5+ messages in thread
From: Andrew Morton @ 2004-03-18  8:33 UTC (permalink / raw)
  To: Jeffrey Siegal; +Cc: linux-kernel

Jeffrey Siegal <jbs@quiotix.com> wrote:
>
> This is on FC1 with Fedora/Red Hat's 2.4.22-1.2174.nptlsmp kernel, 
>  writing to an ext3 file system (journal=ordered) on a 7200rpm IDE drive.
> 
>  O_SYNC:
>  Creating
>  Starting
>  iter = 1000, latency = 8.413535ms
> 
>  O_DSYNC:
>  Creating
>  Starting
>  iter = 1000, latency = 8.429431ms
> 
>  fsync:
>  Creating
>  Starting
>  iter = 1000, latency = 34.499984ms
> 
>  fdatasync:
>  Creating
>  Starting
>  iter = 1000, latency = 35.568508ms

This is because ext3 is being dumb.  It is forcing a journal commit even
when the inode has not been altered.

Here's a moderately-tested 2.6 fix.  A 2.4 fix would be similar.


File overwrite:

	vmm:/mnt/hda5> ~/fsync-is-sucky 
	O_SYNC:
	Creating
	Starting
	iter = 1000, latency = 0.191464ms

	O_DSYNC:
	Creating
	Starting
	iter = 1000, latency = 0.181637ms

	fsync:
	Creating
	Starting
	iter = 1000, latency = 0.186326ms

	fdatasync:
	Creating
	Starting
	iter = 1000, latency = 0.193013ms

File append:

	vmm:/mnt/hda5> ~/fsync-is-sucky 1
	O_SYNC:
	Starting
	iter = 1000, latency = 6.169613ms

	O_DSYNC:
	Starting
	iter = 1000, latency = 6.131574ms

	fsync:
	Starting
	iter = 1000, latency = 6.140611ms

	fdatasync:
	Starting
	iter = 1000, latency = 6.111737ms



---

 25-akpm/fs/ext3/fsync.c     |   38 ++++++++++++++++++++++++++++----------
 25-akpm/fs/fs-writeback.c   |   36 +++++++++++++++++++++++++++++++-----
 25-akpm/include/linux/fs.h  |    1 +
 25-akpm/mm/page-writeback.c |    2 ++
 4 files changed, 62 insertions(+), 15 deletions(-)

diff -puN fs/ext3/fsync.c~ext3-fsync-speedup fs/ext3/fsync.c
--- 25/fs/ext3/fsync.c~ext3-fsync-speedup	2004-03-17 23:35:29.679572200 -0800
+++ 25-akpm/fs/ext3/fsync.c	2004-03-18 00:25:20.604882440 -0800
@@ -24,6 +24,8 @@
 
 #include <linux/time.h>
 #include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/writeback.h>
 #include <linux/jbd.h>
 #include <linux/ext3_fs.h>
 #include <linux/ext3_jbd.h>
@@ -38,29 +40,28 @@
  *
  * What we do is just kick off a commit and wait on it.  This will snapshot the
  * inode to disk.
- *
- * Note that there is a serious optimisation we can make here: if the current
- * inode is not part of j_running_transaction or j_committing_transaction
- * then we have nothing to do.  That would require implementation of t_ilist,
- * which isn't too hard.
  */
 
 int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
 {
 	struct inode *inode = dentry->d_inode;
+	int ret = 0;
 
 	J_ASSERT(ext3_journal_current_handle() == 0);
 
+	smp_mb();		/* prepare for lockless i_state read */
+	if (!(inode->i_state & I_DIRTY))
+		goto out;
+
 	/*
 	 * data=writeback:
 	 *  The caller's filemap_fdatawrite()/wait will sync the data.
-	 *  ext3_force_commit() will sync the metadata
+	 *  sync_inode() will sync the metadata
 	 *
 	 * data=ordered:
 	 *  The caller's filemap_fdatawrite() will write the data and
-	 *  ext3_force_commit() will wait on the buffers.  Then the caller's
-	 *  filemap_fdatawait() will wait on the pages (but all IO is complete)
-	 *  Not pretty, but it works.
+	 *  sync_inode() will write the inode if it is dirty.  Then the caller's
+	 *  filemap_fdatawait() will wait on the pages.
 	 *
 	 * data=journal:
 	 *  filemap_fdatawrite won't do anything (the buffers are clean).
@@ -70,5 +71,22 @@ int ext3_sync_file(struct file * file, s
 	 *  (they were dirtied by commit).  But that's OK - the blocks are
 	 *  safe in-journal, which is all fsync() needs to ensure.
 	 */
-	return ext3_force_commit(inode->i_sb);
+	if (ext3_should_journal_data(inode)) {
+		ret = ext3_force_commit(inode->i_sb);
+		goto out;
+	}
+
+	/*
+	 * The VFS has written the file data.  If the inode is unaltered
+	 * then we need not start a commit.
+	 */
+	if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) {
+		struct writeback_control wbc = {
+			.sync_mode = WB_SYNC_ALL,
+			.nr_to_write = 0, /* sys_fsync did this */
+		};
+		ret = sync_inode(inode, &wbc);
+	}
+out:
+	return ret;
 }
diff -puN fs/fs-writeback.c~ext3-fsync-speedup fs/fs-writeback.c
--- 25/fs/fs-writeback.c~ext3-fsync-speedup	2004-03-17 23:35:29.680572048 -0800
+++ 25-akpm/fs/fs-writeback.c	2004-03-18 00:32:32.548217072 -0800
@@ -131,13 +131,14 @@ static void write_inode(struct inode *in
  *
  * Called under inode_lock.
  */
-static void
+static int
 __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	unsigned dirty;
 	struct address_space *mapping = inode->i_mapping;
 	struct super_block *sb = inode->i_sb;
 	int wait = wbc->sync_mode == WB_SYNC_ALL;
+	int ret;
 
 	BUG_ON(inode->i_state & I_LOCK);
 
@@ -158,14 +159,17 @@ __sync_single_inode(struct inode *inode,
 	spin_unlock(&mapping->page_lock);
 	spin_unlock(&inode_lock);
 
-	do_writepages(mapping, wbc);
+	ret = do_writepages(mapping, wbc);
 
 	/* Don't write the inode if only I_DIRTY_PAGES was set */
 	if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC))
 		write_inode(inode, wait);
 
-	if (wait)
-		filemap_fdatawait(mapping);
+	if (wait) {
+		int err = filemap_fdatawait(mapping);
+		if (ret == 0)
+			ret = err;
+	}
 
 	spin_lock(&inode_lock);
 	inode->i_state &= ~I_LOCK;
@@ -189,6 +193,7 @@ __sync_single_inode(struct inode *inode,
 		}
 	}
 	wake_up_inode(inode);
+	return ret;
 }
 
 /*
@@ -493,10 +498,31 @@ void write_inode_now(struct inode *inode
 	if (sync)
 		wait_on_inode(inode);
 }
-
 EXPORT_SYMBOL(write_inode_now);
 
 /**
+ * sync_inode - write an inode and its pages to disk.
+ * @inode: the inode to sync
+ * @wbc: controls the writeback mode
+ *
+ * sync_inode() will write an inode and its pages to disk.  It will also
+ * correctly update the inode on its superblock's disty inode lists and will
+ * update inode->i_state.
+ *
+ * The caller must have a ref on the inode.
+ */
+int sync_inode(struct inode *inode, struct writeback_control *wbc)
+{
+	int ret;
+
+	spin_lock(&inode_lock);
+	ret = __sync_single_inode(inode, wbc);
+	spin_unlock(&inode_lock);
+	return ret;
+}
+EXPORT_SYMBOL(sync_inode);
+
+/**
  * generic_osync_inode - flush all dirty data for a given inode to disk
  * @inode: inode to write
  * @what:  what to write and wait upon
diff -puN include/linux/fs.h~ext3-fsync-speedup include/linux/fs.h
--- 25/include/linux/fs.h~ext3-fsync-speedup	2004-03-17 23:35:29.682571744 -0800
+++ 25-akpm/include/linux/fs.h	2004-03-17 23:35:29.687570984 -0800
@@ -919,6 +919,7 @@ static inline void mark_inode_dirty_sync
 	__mark_inode_dirty(inode, I_DIRTY_SYNC);
 }
 
+int sync_inode(struct inode *inode, struct writeback_control *wbc);
 
 /**
  * &export_operations - for nfsd to communicate with file systems
diff -puN mm/page-writeback.c~ext3-fsync-speedup mm/page-writeback.c
--- 25/mm/page-writeback.c~ext3-fsync-speedup	2004-03-17 23:35:29.683571592 -0800
+++ 25-akpm/mm/page-writeback.c	2004-03-17 23:35:29.688570832 -0800
@@ -486,6 +486,8 @@ void __init page_writeback_init(void)
 
 int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
 {
+	if (wbc->nr_to_write <= 0)
+		return 0;
 	if (mapping->a_ops->writepages)
 		return mapping->a_ops->writepages(mapping, wbc);
 	return generic_writepages(mapping, wbc);

_


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: Why is fsync so much slower than O_SYNC?
  2004-03-18  8:33 ` Andrew Morton
@ 2004-03-18 15:42   ` Jeffrey Siegal
  2004-03-18 17:07     ` Andrew Morton
  2004-03-18 16:36   ` Jeffrey Siegal
  1 sibling, 1 reply; 5+ messages in thread
From: Jeffrey Siegal @ 2004-03-18 15:42 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel

Andrew Morton wrote:
> This is because ext3 is being dumb.  It is forcing a journal commit even
> when the inode has not been altered.
> 
> Here's a moderately-tested 2.6 fix.  A 2.4 fix would be similar.
> 
> 
> File overwrite:
> 
> 	vmm:/mnt/hda5> ~/fsync-is-sucky 
> 	O_SYNC:
> 	Creating
> 	Starting
> 	iter = 1000, latency = 0.191464ms
> 
> 	O_DSYNC:
> 	Creating
> 	Starting
> 	iter = 1000, latency = 0.181637ms
> 
> 	fsync:
> 	Creating
> 	Starting
> 	iter = 1000, latency = 0.186326ms
> 
> 	fdatasync:
> 	Creating
> 	Starting
> 	iter = 1000, latency = 0.193013ms

Hmm, this doesn't seem right.  For an overwrite, it should still be 
waiting for just the data page to be written out, and that should take 
about 8ms (for a 7200rpm drive), and the append case should take longer, 
since it requires writing both the data page and then the journal.


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: Why is fsync so much slower than O_SYNC?
  2004-03-18  8:33 ` Andrew Morton
  2004-03-18 15:42   ` Jeffrey Siegal
@ 2004-03-18 16:36   ` Jeffrey Siegal
  1 sibling, 0 replies; 5+ messages in thread
From: Jeffrey Siegal @ 2004-03-18 16:36 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel

Some odd results on ext2 as well:

README: Make sure you have turned off hardware write caching (hdparm -W0 
/dev/hda for IDE)

O_SYNC:
Creating
Starting
iter = 1000, latency = 8.419908ms

O_DSYNC:
Creating
Starting
iter = 1000, latency = 8.369892ms

fsync:
Creating
Starting
iter = 1000, latency = 13.191030ms

fdatasync:
Creating
Starting
iter = 1000, latency = 8.829373ms


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: Why is fsync so much slower than O_SYNC?
  2004-03-18 15:42   ` Jeffrey Siegal
@ 2004-03-18 17:07     ` Andrew Morton
  0 siblings, 0 replies; 5+ messages in thread
From: Andrew Morton @ 2004-03-18 17:07 UTC (permalink / raw)
  To: Jeffrey Siegal; +Cc: linux-kernel

Jeffrey Siegal <jbs@quiotix.com> wrote:
>
> Andrew Morton wrote:
> > This is because ext3 is being dumb.  It is forcing a journal commit even
> > when the inode has not been altered.
> > 
> > Here's a moderately-tested 2.6 fix.  A 2.4 fix would be similar.
> > 
> > 
> > File overwrite:
> > 
> > 	vmm:/mnt/hda5> ~/fsync-is-sucky 
> > 	O_SYNC:
> > 	Creating
> > 	Starting
> > 	iter = 1000, latency = 0.191464ms
> > 
> > 	O_DSYNC:
> > 	Creating
> > 	Starting
> > 	iter = 1000, latency = 0.181637ms
> > 
> > 	fsync:
> > 	Creating
> > 	Starting
> > 	iter = 1000, latency = 0.186326ms
> > 
> > 	fdatasync:
> > 	Creating
> > 	Starting
> > 	iter = 1000, latency = 0.193013ms
> 
> Hmm, this doesn't seem right.  For an overwrite, it should still be 
> waiting for just the data page to be written out, and that should take 
> about 8ms (for a 7200rpm drive),

That's with the disk writeback cache enabled.

> and the append case should take longer, 
> since it requires writing both the data page and then the journal.

Append takes 40 times longer, because in that case we need to commit the
journal after each sync to write the modifed inode.


^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2004-03-18 17:09 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2004-03-17 16:40 Why is fsync so much slower than O_SYNC? Jeffrey Siegal
2004-03-18  8:33 ` Andrew Morton
2004-03-18 15:42   ` Jeffrey Siegal
2004-03-18 17:07     ` Andrew Morton
2004-03-18 16:36   ` Jeffrey Siegal

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox