linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] add -o flush for fat
@ 2006-08-04 19:27 Chris Mason
  2006-08-05  1:31 ` Andrew Morton
  0 siblings, 1 reply; 8+ messages in thread
From: Chris Mason @ 2006-08-04 19:27 UTC (permalink / raw)
  To: linux-fsdevel, akpm

From: Chris Mason <mason@suse.com>
Subject: add -o flush for fat

Fat is commonly used on removable media, mounting with -o flush tells the
FS to write things to disk as quickly as possible.  It is like -o sync, but
much faster (and not as safe).

Signed-off-by: Chris Mason <mason@suse.com>

--- a/fs/fat/file.c	Fri Aug 04 14:02:26 2006 -0400
+++ b/fs/fat/file.c	Fri Aug 04 15:25:09 2006 -0400
@@ -13,6 +13,7 @@
 #include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/writeback.h>
+#include <linux/blkdev.h>
 
 int fat_generic_ioctl(struct inode *inode, struct file *filp,
 		      unsigned int cmd, unsigned long arg)
@@ -112,6 +113,17 @@ int fat_generic_ioctl(struct inode *inod
 	}
 }
 
+static int fat_file_release(struct inode *inode, struct file *filp)
+{
+	if ((filp->f_mode & FMODE_WRITE) &&
+	     MSDOS_SB(inode->i_sb)->options.flush) {
+		writeback_inode(inode);
+		writeback_bdev(inode->i_sb);
+		blk_congestion_wait(WRITE, HZ/10);
+	}
+	return 0;
+}
+
 const struct file_operations fat_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
@@ -121,6 +133,7 @@ const struct file_operations fat_file_op
 	.aio_read	= generic_file_aio_read,
 	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
+	.release	= fat_file_release,
 	.ioctl		= fat_generic_ioctl,
 	.fsync		= file_fsync,
 	.sendfile	= generic_file_sendfile,
@@ -289,6 +302,10 @@ void fat_truncate(struct inode *inode)
 	lock_kernel();
 	fat_free(inode, nr_clusters);
 	unlock_kernel();
+	if (MSDOS_SB(inode->i_sb)->options.flush) {
+		writeback_inode(inode);
+		writeback_bdev(inode->i_sb);
+	}
 }
 
 struct inode_operations fat_file_inode_operations = {
--- a/fs/fat/inode.c	Fri Aug 04 14:02:26 2006 -0400
+++ b/fs/fat/inode.c	Fri Aug 04 14:02:26 2006 -0400
@@ -24,6 +24,7 @@
 #include <linux/vfs.h>
 #include <linux/parser.h>
 #include <linux/uio.h>
+#include <linux/writeback.h>
 #include <asm/unaligned.h>
 
 #ifndef CONFIG_FAT_DEFAULT_IOCHARSET
@@ -861,7 +862,7 @@ enum {
 	Opt_charset, Opt_shortname_lower, Opt_shortname_win95,
 	Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes,
 	Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes,
-	Opt_obsolate, Opt_err,
+	Opt_obsolate, Opt_flush, Opt_err,
 };
 
 static match_table_t fat_tokens = {
@@ -893,7 +894,8 @@ static match_table_t fat_tokens = {
 	{Opt_obsolate, "cvf_format=%20s"},
 	{Opt_obsolate, "cvf_options=%100s"},
 	{Opt_obsolate, "posix"},
-	{Opt_err, NULL}
+	{Opt_flush, "flush"},
+	{Opt_err, NULL},
 };
 static match_table_t msdos_tokens = {
 	{Opt_nodots, "nodots"},
@@ -1034,6 +1036,9 @@ static int parse_options(char *options, 
 				return 0;
 			opts->codepage = option;
 			break;
+		case Opt_flush:
+			opts->flush = 1;
+			break;
 
 		/* msdos specific */
 		case Opt_dots:
--- a/fs/fs-writeback.c	Fri Aug 04 14:02:26 2006 -0400
+++ b/fs/fs-writeback.c	Fri Aug 04 14:41:46 2006 -0400
@@ -391,6 +391,41 @@ sync_sb_inodes(struct super_block *sb, s
 }
 
 /*
+ * starts IO on any dirty blocks in the block device.  This uses
+ * filemap_flush, and so it does not wait for the io to finish, and does
+ * not wait on any IO currently in flight.
+ */
+void writeback_bdev(struct super_block *sb)
+{
+	struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
+	filemap_flush(mapping);
+}
+EXPORT_SYMBOL_GPL(writeback_bdev);
+
+/*
+ * start writeback on both the inode and the data blocks.
+ * filemap_fdatawrite is used, so it waits for any IO that was in
+ * flight for dirty blocks at the start of this call.  This will not
+ * wait for any IO it starts.
+ */
+void writeback_inode(struct inode *inode)
+{
+
+	struct address_space *mapping = inode->i_mapping;
+	struct writeback_control wbc = {
+		.sync_mode = WB_SYNC_NONE,
+		.nr_to_write = 0,
+	};
+	/* if we used WB_SYNC_ALL, sync_inode waits for the io for the
+	 * inode to finish.  So WB_SYNC_NONE is sent down to sync_inode
+	 * and filemap_fdatawrite is used for the data blocks
+	 */
+	sync_inode(inode, &wbc);
+	filemap_fdatawrite(mapping);
+}
+EXPORT_SYMBOL_GPL(writeback_inode);
+
+/*
  * Start writeback of dirty pagecache data against all unlocked inodes.
  *
  * Note:
--- a/fs/msdos/namei.c	Fri Aug 04 14:02:26 2006 -0400
+++ b/fs/msdos/namei.c	Fri Aug 04 14:02:26 2006 -0400
@@ -11,6 +11,7 @@
 #include <linux/buffer_head.h>
 #include <linux/msdos_fs.h>
 #include <linux/smp_lock.h>
+#include <linux/writeback.h>
 
 /* Characters that are undesirable in an MS-DOS file name */
 static unsigned char bad_chars[] = "*?<>|\"";
@@ -280,7 +281,7 @@ static int msdos_create(struct inode *di
 			struct nameidata *nd)
 {
 	struct super_block *sb = dir->i_sb;
-	struct inode *inode;
+	struct inode *inode = NULL;
 	struct fat_slot_info sinfo;
 	struct timespec ts;
 	unsigned char msdos_name[MSDOS_NAME];
@@ -316,6 +317,11 @@ static int msdos_create(struct inode *di
 	d_instantiate(dentry, inode);
 out:
 	unlock_kernel();
+	if (!err && MSDOS_SB(sb)->options.flush) {
+		writeback_inode(dir);
+		writeback_inode(inode);
+		writeback_bdev(sb);
+	}
 	return err;
 }
 
@@ -348,6 +354,11 @@ static int msdos_rmdir(struct inode *dir
 	fat_detach(inode);
 out:
 	unlock_kernel();
+	if (!err && MSDOS_SB(inode->i_sb)->options.flush) {
+		writeback_inode(dir);
+		writeback_inode(inode);
+		writeback_bdev(inode->i_sb);
+	}
 
 	return err;
 }
@@ -401,6 +412,11 @@ static int msdos_mkdir(struct inode *dir
 	d_instantiate(dentry, inode);
 
 	unlock_kernel();
+	if (MSDOS_SB(sb)->options.flush) {
+		writeback_inode(dir);
+		writeback_inode(inode);
+		writeback_bdev(sb);
+	}
 	return 0;
 
 out_free:
@@ -430,6 +446,11 @@ static int msdos_unlink(struct inode *di
 	fat_detach(inode);
 out:
 	unlock_kernel();
+	if (!err && MSDOS_SB(inode->i_sb)->options.flush) {
+		writeback_inode(dir);
+		writeback_inode(inode);
+		writeback_bdev(inode->i_sb);
+	}
 
 	return err;
 }
@@ -635,6 +656,11 @@ static int msdos_rename(struct inode *ol
 			      new_dir, new_msdos_name, new_dentry, is_hid);
 out:
 	unlock_kernel();
+	if (!err && MSDOS_SB(old_dir->i_sb)->options.flush) {
+		writeback_inode(old_dir);
+		writeback_inode(new_dir);
+		writeback_bdev(old_dir->i_sb);
+	}
 	return err;
 }
 
--- a/include/linux/msdos_fs.h	Fri Aug 04 14:02:26 2006 -0400
+++ b/include/linux/msdos_fs.h	Fri Aug 04 14:02:26 2006 -0400
@@ -204,6 +204,7 @@ struct fat_mount_options {
 		 unicode_xlate:1, /* create escape sequences for unhandled Unicode */
 		 numtail:1,       /* Does first alias have a numeric '~1' type tail? */
 		 atari:1,         /* Use Atari GEMDOS variation of MS-DOS fs */
+		 flush:1,	  /* write things quickly */
 		 nocase:1;	  /* Does this need case conversion? 0=need case conversion*/
 };
 
--- a/include/linux/writeback.h	Fri Aug 04 14:02:26 2006 -0400
+++ b/include/linux/writeback.h	Fri Aug 04 14:02:26 2006 -0400
@@ -69,6 +69,8 @@ int inode_wait(void *);
 int inode_wait(void *);
 void sync_inodes_sb(struct super_block *, int wait);
 void sync_inodes(int wait);
+void writeback_bdev(struct super_block *);
+void writeback_inode(struct inode *);
 
 /* writeback.h requires fs.h; it, too, is not included from here. */
 static inline void wait_on_inode(struct inode *inode)

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] add -o flush for fat
  2006-08-04 19:27 [PATCH] add -o flush for fat Chris Mason
@ 2006-08-05  1:31 ` Andrew Morton
  2006-08-05 12:26   ` Chris Mason
  0 siblings, 1 reply; 8+ messages in thread
From: Andrew Morton @ 2006-08-05  1:31 UTC (permalink / raw)
  To: Chris Mason; +Cc: linux-fsdevel

On Fri, 4 Aug 2006 15:27:21 -0400
Chris Mason <mason@suse.com> wrote:

>  /*
> + * starts IO on any dirty blocks in the block device.  This uses
> + * filemap_flush, and so it does not wait for the io to finish, and does
> + * not wait on any IO currently in flight.
> + */
> +void writeback_bdev(struct super_block *sb)
> +{
> +	struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
> +	filemap_flush(mapping);
> +}
> +EXPORT_SYMBOL_GPL(writeback_bdev);
> +
> +/*
> + * start writeback on both the inode and the data blocks.
> + * filemap_fdatawrite is used, so it waits for any IO that was in
> + * flight for dirty blocks at the start of this call.  This will not
> + * wait for any IO it starts.
> + */
> +void writeback_inode(struct inode *inode)
> +{
> +
> +	struct address_space *mapping = inode->i_mapping;
> +	struct writeback_control wbc = {
> +		.sync_mode = WB_SYNC_NONE,
> +		.nr_to_write = 0,
> +	};
> +	/* if we used WB_SYNC_ALL, sync_inode waits for the io for the
> +	 * inode to finish.  So WB_SYNC_NONE is sent down to sync_inode
> +	 * and filemap_fdatawrite is used for the data blocks
> +	 */
> +	sync_inode(inode, &wbc);
> +	filemap_fdatawrite(mapping);
> +}
> +EXPORT_SYMBOL_GPL(writeback_inode);

Could we have more detail on what you're trying to do here?

You've obviously made some decisions about how to handle under-writeback
data, dirty data, integrity versus cleaning, etc.  What were those
decisions and what led you to them?

We should document the precise semantics of these two functions, especially
wrt data integrity, handling of dirty-but-under-writeback pages, etc. 
(Right now I'm not sure what those semantics are).

I'm also wondering what sync_inode(nr_to_write=0) does ;) I think it writes
one page...



^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] add -o flush for fat
  2006-08-05  1:31 ` Andrew Morton
@ 2006-08-05 12:26   ` Chris Mason
  2006-08-05 19:12     ` Andrew Morton
  0 siblings, 1 reply; 8+ messages in thread
From: Chris Mason @ 2006-08-05 12:26 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-fsdevel

On Fri, Aug 04, 2006 at 06:31:49PM -0700, Andrew Morton wrote:
> 
> Could we have more detail on what you're trying to do here?
> 
> You've obviously made some decisions about how to handle under-writeback
> data, dirty data, integrity versus cleaning, etc.  What were those
> decisions and what led you to them?

The basic idea is that someone watching the blinking light on the usb
stick knows that when the light goes out, he can pull the stick out.
It's an alternative to mount -o sync that it closer to what users are
asking for.

There's no attempt at integrity, it only tries to keep a fairly constant
flow of data to the device.

> 
> We should document the precise semantics of these two functions, especially
> wrt data integrity, handling of dirty-but-under-writeback pages, etc. 
> (Right now I'm not sure what those semantics are).

They are loosely defined.  One way to handle
dirty-but-under-writeback pages is to add a timer that
filemap_fdatawrites the block device.  Since fdatawrite is used in the
file release call, regular files don't need this, only the block device.

This timer isn't added in the patch, but it's an easy addition later.

> 
> I'm also wondering what sync_inode(nr_to_write=0) does ;) I think it writes
> one page...

It starts a write on the inode and nothing else ;)  There are lots of
ways to write the inode and wait, but this was the only one I found to
write the inode without waiting and then write the data pages while
waiting for in flight io.

-chris


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] add -o flush for fat
  2006-08-05 12:26   ` Chris Mason
@ 2006-08-05 19:12     ` Andrew Morton
  2006-08-05 20:54       ` OGAWA Hirofumi
  2006-08-07 20:23       ` Chris Mason
  0 siblings, 2 replies; 8+ messages in thread
From: Andrew Morton @ 2006-08-05 19:12 UTC (permalink / raw)
  To: Chris Mason; +Cc: linux-fsdevel, OGAWA Hirofumi

On Sat, 5 Aug 2006 08:26:29 -0400
Chris Mason <mason@suse.com> wrote:

> On Fri, Aug 04, 2006 at 06:31:49PM -0700, Andrew Morton wrote:
> > 
> > Could we have more detail on what you're trying to do here?
> > 
> > You've obviously made some decisions about how to handle under-writeback
> > data, dirty data, integrity versus cleaning, etc.  What were those
> > decisions and what led you to them?
> 
> The basic idea is that someone watching the blinking light on the usb
> stick knows that when the light goes out, he can pull the stick out.

Yes.

> It's an alternative to mount -o sync that it closer to what users are
> asking for.
> 
> There's no attempt at integrity, it only tries to keep a fairly constant
> flow of data to the device.

It's a data-integrity operation, because we want that filesystem to be in
good shape after the LED goes off.  Not after a pdflush interval of 30
seconds.

Consequently the semantics which we require of the library functions which
you've added should be:

/*
 * On return from this function all dirty data has been cleaned - it is either
 * on disk or is in flight in the I/O queues.
 */

These semantics are exactly provided by
do_sync_file_range(SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE).
I'd suggest that do_sync_file_range() could be used to sync the file's
data in this application, if only for reasons of clarity and precision.

That leaves the file metadata, the inode itself and the superblock, for
which we'd ideally likely same semantics, but stronger sync semantics would
of course meet our requirement.

metadata: we don't have an
SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE implementation of
fsync_buffers_list() so unless we add such, we need to use
fsync_buffers_list()'s
(SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER)
semantics.  I'd suggest a call to sync_mapping_buffers() directly.

The inode: write_inode_now(wait=0) will I think be appropriate here.

superblock: it's possible that the proposed three-step scheme will leave
the filesystem's superblock unwritten.  Need to think about that.

So I don't think any additional helper functions are needed.  The above
three are all exported to filesystems.


To test all this I'd suggest that you verify that `-o flush' takes
/rpoc/meminfo:Dirty reliably down to zero on a quiet system for various
operations.


If we're not adding new VFS functions then fatfs can generally hack around
with these functions until it does what we want it to do.  If we _do_ add
new VFS functions then their exact data integrity behaviour should be
carefully designed and documented, please.  Otherwise we'll lose data or
make people's machines slower than they need to be - we have a track record
of screwing this stuff up.

There was quite some duplication of code within the fat patch you sent - it
looked like a little helper function was needed.

I expect the `-o flush' behaviour will be useful on other filesystem types.
Did you consider moving it into the generic mount options (MS_FLUSH?)
rather than making it fat-specific?

Sorry to complicate a little patch, but I'd prefer to get this right
going-in.

iirc, Hirofumi-san had a patch which did the same thing as this one 6-odd
months ago, but it seemed to die off.


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] add -o flush for fat
  2006-08-05 19:12     ` Andrew Morton
@ 2006-08-05 20:54       ` OGAWA Hirofumi
  2006-08-07 20:23       ` Chris Mason
  1 sibling, 0 replies; 8+ messages in thread
From: OGAWA Hirofumi @ 2006-08-05 20:54 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Chris Mason, linux-fsdevel

Andrew Morton <akpm@osdl.org> writes:

>> The basic idea is that someone watching the blinking light on the usb
>> stick knows that when the light goes out, he can pull the stick out.
>
> Yes.

I think it would not be desirable it depends a LED, because some
devices doesn't have a LED.  (e.g. MMC/SD, MS, etc.)

>> It's an alternative to mount -o sync that it closer to what users are
>> asking for.
>> 
>> There's no attempt at integrity, it only tries to keep a fairly constant
>> flow of data to the device.
>
> It's a data-integrity operation, because we want that filesystem to be in
> good shape after the LED goes off.  Not after a pdflush interval of 30
> seconds.
>
> Consequently the semantics which we require of the library functions which
> you've added should be:
>
> /*
>  * On return from this function all dirty data has been cleaned - it is either
>  * on disk or is in flight in the I/O queues.
>  */
>
> These semantics are exactly provided by
> do_sync_file_range(SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE).
> I'd suggest that do_sync_file_range() could be used to sync the file's
> data in this application, if only for reasons of clarity and precision.
>
> That leaves the file metadata, the inode itself and the superblock, for
> which we'd ideally likely same semantics, but stronger sync semantics would
> of course meet our requirement.
>
> metadata: we don't have an
> SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE implementation of
> fsync_buffers_list() so unless we add such, we need to use
> fsync_buffers_list()'s
> (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER)
> semantics.  I'd suggest a call to sync_mapping_buffers() directly.
>
> The inode: write_inode_now(wait=0) will I think be appropriate here.
>
> superblock: it's possible that the proposed three-step scheme will leave
> the filesystem's superblock unwritten.  Need to think about that.

Yes, the superblock is problem. And for it, I'm thinking to add new
"autorw" option.

autorw changes `ro' to `rw' automatically on first write operation.

The daemon is watching it in userland. And it'll change option from
`rw' to `ro' by remount.  With remount, filesystem will flush all
dirty data, and marks it as clean.

So, if filesystem is `ro', user can unplug device.

But, I still don't know whether this works fine or not. If it works,
I'm thinking very very simple "flush" option would be enough.

Thanks.
-- 
OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] add -o flush for fat
  2006-08-05 19:12     ` Andrew Morton
  2006-08-05 20:54       ` OGAWA Hirofumi
@ 2006-08-07 20:23       ` Chris Mason
  2006-08-07 23:58         ` Andrew Morton
  1 sibling, 1 reply; 8+ messages in thread
From: Chris Mason @ 2006-08-07 20:23 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-fsdevel, OGAWA Hirofumi

On Sat, Aug 05, 2006 at 12:12:43PM -0700, Andrew Morton wrote:

[ thanks for the comments everyone ]

> Consequently the semantics which we require of the library functions which
> you've added should be:
> 
> /*
>  * On return from this function all dirty data has been cleaned - it is either
>  * on disk or is in flight in the I/O queues.
>  */
> 
> These semantics are exactly provided by
> do_sync_file_range(SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE).
> I'd suggest that do_sync_file_range() could be used to sync the file's
> data in this application, if only for reasons of clarity and precision.

do_sync_file_range is for files.  I don't have files for the directory
inodes.

> 
> That leaves the file metadata, the inode itself and the superblock, for
> which we'd ideally likely same semantics, but stronger sync semantics would
> of course meet our requirement.
> 
> metadata: we don't have an
> SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE implementation of
> fsync_buffers_list() so unless we add such, we need to use
> fsync_buffers_list()'s
> (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER)
> semantics.  I'd suggest a call to sync_mapping_buffers() directly.

Sadly fat doesn't use mark_buffer_dirty_inode.  We can change it to do
so of course.

> 
> The inode: write_inode_now(wait=0) will I think be appropriate here.

The fun part of write_inode_now is the part where __sync_single_inode
does this:

	int wait = wbc->sync_mode == WB_SYNC_ALL;
	...
	if (wait) {
		int err = filemap_fdatawait(mapping);
	...

I forget this every time and had to sysrq-t my way back to this
discovery again today.  This may or may not be the desired result of
write_inode_now, but it is not quite right for -o flush.

> 
> superblock: it's possible that the proposed three-step scheme will leave
> the filesystem's superblock unwritten.  Need to think about that.
> 
> So I don't think any additional helper functions are needed.  The above
> three are all exported to filesystems.
> 
> 
> To test all this I'd suggest that you verify that `-o flush' takes
> /rpoc/meminfo:Dirty reliably down to zero on a quiet system for various
> operations.
> 
My test:

cp -a /usr/share/doc/pages fat_fs/orig
cd fat_fs ; cp -a orig copy
sleep 2
reset

Then diff the directories after things come back.

> 
> If we're not adding new VFS functions then fatfs can generally hack around
> with these functions until it does what we want it to do.  If we _do_ add
> new VFS functions then their exact data integrity behaviour should be
> carefully designed and documented, please.  Otherwise we'll lose data or
> make people's machines slower than they need to be - we have a track record
> of screwing this stuff up.
> 
> There was quite some duplication of code within the fat patch you sent - it
> looked like a little helper function was needed.

New patch below, it is slightly different.  I'm still hoping we can
avoid the timer to cover in flight ios ignored by filemap_flush, but it
may come to that later on.

> 
> I expect the `-o flush' behaviour will be useful on other filesystem types.
> Did you consider moving it into the generic mount options (MS_FLUSH?)
> rather than making it fat-specific?

I did, but this is one of those fuzzy patches.  It's not actually a good
idea, but it's the best compromise I can find between what the users
wanted and what I was willing to ship.

If other filesystems what this bad idea, we can certainly add it, but I
wanted to start small...

> 
> Sorry to complicate a little patch, but I'd prefer to get this right
> going-in.
> 
> iirc, Hirofumi-san had a patch which did the same thing as this one 6-odd
> months ago, but it seemed to die off.
> 

The ones I found 6-odd months ago didn't cover metadata correctly.
His stuff may have improved, sorry if I missed it.

From: Chris Mason <mason@suse.com>
Subject: add -o flush for fat

Fat is commonly used on removable media. Mounting with -o flush tells the
FS to write things to disk as quickly as possible.  It is like -o sync, but
much faster (and not as safe).

Signed-off-by: Chris Mason <mason@suse.com>

---
diff -r 8a50d9b6ce6f fs/fat/file.c
--- a/fs/fat/file.c	Fri Aug 04 14:02:26 2006 -0400
+++ b/fs/fat/file.c	Mon Aug 07 15:45:42 2006 -0400
@@ -13,6 +13,7 @@
 #include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/writeback.h>
+#include <linux/blkdev.h>
 
 int fat_generic_ioctl(struct inode *inode, struct file *filp,
 		      unsigned int cmd, unsigned long arg)
@@ -112,6 +113,16 @@ int fat_generic_ioctl(struct inode *inod
 	}
 }
 
+static int fat_file_release(struct inode *inode, struct file *filp)
+{
+	if ((filp->f_mode & FMODE_WRITE) &&
+	     MSDOS_SB(inode->i_sb)->options.flush) {
+		fat_flush_inodes(inode->i_sb, inode, NULL);
+		blk_congestion_wait(WRITE, HZ/10);
+	}
+	return 0;
+}
+
 const struct file_operations fat_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
@@ -121,6 +132,7 @@ const struct file_operations fat_file_op
 	.aio_read	= generic_file_aio_read,
 	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
+	.release	= fat_file_release,
 	.ioctl		= fat_generic_ioctl,
 	.fsync		= file_fsync,
 	.sendfile	= generic_file_sendfile,
@@ -289,6 +301,7 @@ void fat_truncate(struct inode *inode)
 	lock_kernel();
 	fat_free(inode, nr_clusters);
 	unlock_kernel();
+	fat_flush_inodes(inode->i_sb, inode, NULL);
 }
 
 struct inode_operations fat_file_inode_operations = {
diff -r 8a50d9b6ce6f fs/fat/inode.c
--- a/fs/fat/inode.c	Fri Aug 04 14:02:26 2006 -0400
+++ b/fs/fat/inode.c	Mon Aug 07 16:19:59 2006 -0400
@@ -24,6 +24,7 @@
 #include <linux/vfs.h>
 #include <linux/parser.h>
 #include <linux/uio.h>
+#include <linux/writeback.h>
 #include <asm/unaligned.h>
 
 #ifndef CONFIG_FAT_DEFAULT_IOCHARSET
@@ -861,7 +862,7 @@ enum {
 	Opt_charset, Opt_shortname_lower, Opt_shortname_win95,
 	Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes,
 	Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes,
-	Opt_obsolate, Opt_err,
+	Opt_obsolate, Opt_flush, Opt_err,
 };
 
 static match_table_t fat_tokens = {
@@ -893,7 +894,8 @@ static match_table_t fat_tokens = {
 	{Opt_obsolate, "cvf_format=%20s"},
 	{Opt_obsolate, "cvf_options=%100s"},
 	{Opt_obsolate, "posix"},
-	{Opt_err, NULL}
+	{Opt_flush, "flush"},
+	{Opt_err, NULL},
 };
 static match_table_t msdos_tokens = {
 	{Opt_nodots, "nodots"},
@@ -1034,6 +1036,9 @@ static int parse_options(char *options, 
 				return 0;
 			opts->codepage = option;
 			break;
+		case Opt_flush:
+			opts->flush = 1;
+			break;
 
 		/* msdos specific */
 		case Opt_dots:
@@ -1435,6 +1440,56 @@ out_fail:
 
 EXPORT_SYMBOL_GPL(fat_fill_super);
 
+/*
+ * helper function for fat_flush_inodes.  This writes both the inode
+ * and the file data blocks, waiting for in flight data blocks before
+ * the start of the call.  It does not wait for any io started
+ * during the call
+ */
+static int writeback_inode(struct inode *inode)
+{
+
+	int ret;
+	struct address_space *mapping = inode->i_mapping;
+	struct writeback_control wbc = {
+	       .sync_mode = WB_SYNC_NONE,
+	      .nr_to_write = 0,
+	};
+	/* if we used WB_SYNC_ALL, sync_inode waits for the io for the
+	* inode to finish.  So WB_SYNC_NONE is sent down to sync_inode
+	* and filemap_fdatawrite is used for the data blocks
+	*/
+	ret = sync_inode(inode, &wbc);
+	if (!ret)
+	       ret = filemap_fdatawrite(mapping);
+	return ret;
+}
+
+/*
+ * write data and metadata corresponding to i1 and i2.  The io is
+ * started but we do not wait for any of it to finish.
+ *
+ * filemap_flush is used for the block device, so if there is a dirty
+ * page for a block already in flight, we will not wait and start the
+ * io over again
+ */
+int fat_flush_inodes(struct super_block *sb, struct inode *i1, struct inode *i2)
+{
+	int ret = 0;
+	if (!MSDOS_SB(sb)->options.flush)
+		return 0;
+	if (i1)
+		ret = writeback_inode(i1);
+	if (!ret && i2)
+		ret = writeback_inode(i2);
+	if (!ret && sb) {
+		struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
+		ret = filemap_flush(mapping);
+	}
+	return ret;
+}
+EXPORT_SYMBOL_GPL(fat_flush_inodes);
+
 static int __init init_fat_fs(void)
 {
 	int err;
diff -r 8a50d9b6ce6f fs/msdos/namei.c
--- a/fs/msdos/namei.c	Fri Aug 04 14:02:26 2006 -0400
+++ b/fs/msdos/namei.c	Mon Aug 07 14:35:17 2006 -0400
@@ -280,7 +280,7 @@ static int msdos_create(struct inode *di
 			struct nameidata *nd)
 {
 	struct super_block *sb = dir->i_sb;
-	struct inode *inode;
+	struct inode *inode = NULL;
 	struct fat_slot_info sinfo;
 	struct timespec ts;
 	unsigned char msdos_name[MSDOS_NAME];
@@ -316,6 +316,8 @@ static int msdos_create(struct inode *di
 	d_instantiate(dentry, inode);
 out:
 	unlock_kernel();
+	if (!err)
+		err = fat_flush_inodes(sb, dir, inode);
 	return err;
 }
 
@@ -348,6 +350,8 @@ static int msdos_rmdir(struct inode *dir
 	fat_detach(inode);
 out:
 	unlock_kernel();
+	if (!err)
+		err = fat_flush_inodes(inode->i_sb, dir, inode);
 
 	return err;
 }
@@ -401,6 +405,7 @@ static int msdos_mkdir(struct inode *dir
 	d_instantiate(dentry, inode);
 
 	unlock_kernel();
+	fat_flush_inodes(sb, dir, inode);
 	return 0;
 
 out_free:
@@ -430,6 +435,8 @@ static int msdos_unlink(struct inode *di
 	fat_detach(inode);
 out:
 	unlock_kernel();
+	if (!err)
+		err = fat_flush_inodes(inode->i_sb, dir, inode);
 
 	return err;
 }
@@ -635,6 +642,8 @@ static int msdos_rename(struct inode *ol
 			      new_dir, new_msdos_name, new_dentry, is_hid);
 out:
 	unlock_kernel();
+	if (!err)
+		err = fat_flush_inodes(old_dir->i_sb, old_dir, new_dir);
 	return err;
 }
 
diff -r 8a50d9b6ce6f include/linux/msdos_fs.h
--- a/include/linux/msdos_fs.h	Fri Aug 04 14:02:26 2006 -0400
+++ b/include/linux/msdos_fs.h	Mon Aug 07 14:35:17 2006 -0400
@@ -204,6 +204,7 @@ struct fat_mount_options {
 		 unicode_xlate:1, /* create escape sequences for unhandled Unicode */
 		 numtail:1,       /* Does first alias have a numeric '~1' type tail? */
 		 atari:1,         /* Use Atari GEMDOS variation of MS-DOS fs */
+		 flush:1,	  /* write things quickly */
 		 nocase:1;	  /* Does this need case conversion? 0=need case conversion*/
 };
 
@@ -412,6 +413,8 @@ extern int fat_fill_super(struct super_b
 extern int fat_fill_super(struct super_block *sb, void *data, int silent,
 			struct inode_operations *fs_dir_inode_ops, int isvfat);
 
+extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
+		            struct inode *i2);
 /* fat/misc.c */
 extern void fat_fs_panic(struct super_block *s, const char *fmt, ...);
 extern void fat_clusters_flush(struct super_block *sb);

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] add -o flush for fat
  2006-08-07 20:23       ` Chris Mason
@ 2006-08-07 23:58         ` Andrew Morton
  2006-08-08  8:05           ` Chris Mason
  0 siblings, 1 reply; 8+ messages in thread
From: Andrew Morton @ 2006-08-07 23:58 UTC (permalink / raw)
  To: Chris Mason; +Cc: linux-fsdevel, OGAWA Hirofumi

On Mon, 7 Aug 2006 16:23:55 -0400
Chris Mason <mason@suse.com> wrote:

> Fat is commonly used on removable media. Mounting with -o flush tells the
> FS to write things to disk as quickly as possible.  It is like -o sync, but
> much faster (and not as safe).
> 

OK, so it's now fat-specific.  That makes it easier, and still useful.

> @@ -112,6 +113,16 @@ int fat_generic_ioctl(struct inode *inod
>  	}
>  }
>  
> +static int fat_file_release(struct inode *inode, struct file *filp)
> +{
> +	if ((filp->f_mode & FMODE_WRITE) &&
> +	     MSDOS_SB(inode->i_sb)->options.flush) {
> +		fat_flush_inodes(inode->i_sb, inode, NULL);
> +		blk_congestion_wait(WRITE, HZ/10);
> +	}
> +	return 0;
> +}

What's the blk_congestion_wait() for?



^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] add -o flush for fat
  2006-08-07 23:58         ` Andrew Morton
@ 2006-08-08  8:05           ` Chris Mason
  0 siblings, 0 replies; 8+ messages in thread
From: Chris Mason @ 2006-08-08  8:05 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-fsdevel, OGAWA Hirofumi

On Mon, Aug 07, 2006 at 04:58:20PM -0700, Andrew Morton wrote:
> On Mon, 7 Aug 2006 16:23:55 -0400
> Chris Mason <mason@suse.com> wrote:
> 
> > Fat is commonly used on removable media. Mounting with -o flush tells the
> > FS to write things to disk as quickly as possible.  It is like -o sync, but
> > much faster (and not as safe).
> > 
> 
> OK, so it's now fat-specific.  That makes it easier, and still useful.
> 
> > @@ -112,6 +113,16 @@ int fat_generic_ioctl(struct inode *inod
> >  	}
> >  }
> >  
> > +static int fat_file_release(struct inode *inode, struct file *filp)
> > +{
> > +	if ((filp->f_mode & FMODE_WRITE) &&
> > +	     MSDOS_SB(inode->i_sb)->options.flush) {
> > +		fat_flush_inodes(inode->i_sb, inode, NULL);
> > +		blk_congestion_wait(WRITE, HZ/10);
> > +	}
> > +	return 0;
> > +}
> 
> What's the blk_congestion_wait() for?

It's just some black magic to throttle.  For sufficiently large files we
really want to throttle during the write, but for most average cases it
helps the stick keep up.

You still need to watch the blinking lights either way.

-chris

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2006-08-08  8:05 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2006-08-04 19:27 [PATCH] add -o flush for fat Chris Mason
2006-08-05  1:31 ` Andrew Morton
2006-08-05 12:26   ` Chris Mason
2006-08-05 19:12     ` Andrew Morton
2006-08-05 20:54       ` OGAWA Hirofumi
2006-08-07 20:23       ` Chris Mason
2006-08-07 23:58         ` Andrew Morton
2006-08-08  8:05           ` Chris Mason

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).