[PATCH] 2.5.x write_super is not for syncing

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH] 2.5.x write_super is not for syncing
@ 2002-10-16 18:00 Chris Mason
  2002-12-02 22:07 ` Andrew Morton
  0 siblings, 1 reply; 21+ messages in thread
From: Chris Mason @ 2002-10-16 18:00 UTC (permalink / raw)
  To: linux-kernel

Hello all,

This patch adds a commit_super super operation that allows the journaled
filesystems to do something different for the periodic write_super calls
and sync.

Based on comments from Al about my last patch, alloc_super sets a
default empty super_operations struct on each super.  This allows us to
get rid of all checks for sb->s_ops == NULL.

sync_supers is changed so that it doesn't loop on a single FS if the
write_super call leaves sb->s_dirt set.  I did this by changing
generic_shutdown_super to use list_del_init(&sb->s_list), which allows
us to check for supers that have been removed from the super_blocks list
while we slept.  The idea came from an sgi patch Hugh Dickins sent me.

Anyway, this is against 2.5.43, please review:

--- 1.161/fs/buffer.c	Tue Oct  8 14:40:47 2002
+++ edited/fs/buffer.c	Wed Oct 16 11:39:04 2002
@@ -217,8 +217,10 @@
 	sync_inodes_sb(sb, 0);
 	DQUOT_SYNC(sb);
 	lock_super(sb);
-	if (sb->s_dirt && sb->s_op && sb->s_op->write_super)
+	if (sb->s_dirt && sb->s_op->write_super)
 		sb->s_op->write_super(sb);
+	if (sb->s_dirt && sb->s_op->commit_super)
+		sb->s_op->commit_super(sb);
 	unlock_super(sb);
 	sync_blockdev(sb->s_bdev);
 	sync_inodes_sb(sb, 1);
@@ -251,7 +253,7 @@
 	wakeup_bdflush(0);
 	sync_inodes(0);	/* All mappings and inodes, including block devices */
 	DQUOT_SYNC(NULL);
-	sync_supers();	/* Write the superblocks */
+	commit_supers();	/* Write the superblocks */
 	sync_inodes(1);	/* All the mappings and inodes, again. */
 	return 0;
 }
@@ -274,7 +276,7 @@
 	/* sync the superblock to buffers */
 	sb = inode->i_sb;
 	lock_super(sb);
-	if (sb->s_op && sb->s_op->write_super)
+	if (sb->s_op->write_super)
 		sb->s_op->write_super(sb);
 	unlock_super(sb);
 
--- 1.22/fs/fs-writeback.c	Sun Sep 22 17:26:49 2002
+++ edited/fs/fs-writeback.c	Wed Oct 16 13:29:51 2002
@@ -57,7 +57,7 @@
 	 * dirty the inode itself
 	 */
 	if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
-		if (sb->s_op && sb->s_op->dirty_inode)
+		if (sb->s_op->dirty_inode)
 			sb->s_op->dirty_inode(inode);
 	}
 
@@ -103,8 +103,7 @@
 
 static void write_inode(struct inode *inode, int sync)
 {
-	if (inode->i_sb->s_op && inode->i_sb->s_op->write_inode &&
-			!is_bad_inode(inode))
+	if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
 		inode->i_sb->s_op->write_inode(inode, sync);
 }
 
--- 1.83/fs/super.c	Mon Sep  9 17:00:57 2002
+++ edited/fs/super.c	Wed Oct 16 10:36:09 2002
@@ -48,6 +48,8 @@
  */
 static struct super_block *alloc_super(void)
 {
+	static struct super_operations default_op = {};
+
 	struct super_block *s = kmalloc(sizeof(struct super_block),  GFP_USER);
 	if (s) {
 		memset(s, 0, sizeof(struct super_block));
@@ -72,6 +74,7 @@
 		s->s_maxbytes = MAX_NON_LFS;
 		s->dq_op = sb_dquot_ops;
 		s->s_qcop = sb_quotactl_ops;
+		s->s_op = &default_op;
 	}
 out:
 	return s;
@@ -203,7 +206,12 @@
 		unlock_super(sb);
 	}
 	spin_lock(&sb_lock);
-	list_del(&sb->s_list);
+
+	/* 
+	 * use list_del_init so we can tell later if a super with an
+	 * incremented count has been removed from all lists
+	 */
+	list_del_init(&sb->s_list);
 	list_del(&sb->s_instances);
 	spin_unlock(&sb_lock);
 	up_write(&sb->s_umount);
@@ -267,35 +275,65 @@
 {
 	lock_super(sb);
 	if (sb->s_root && sb->s_dirt)
-		if (sb->s_op && sb->s_op->write_super)
+		if (sb->s_op->write_super)
 			sb->s_op->write_super(sb);
 	unlock_super(sb);
 }
 
+static inline void commit_super(struct super_block *sb)
+{
+	lock_super(sb);
+	if (sb->s_root && sb->s_dirt) {
+		if (sb->s_op->write_super)
+			sb->s_op->write_super(sb);
+		if (sb->s_op->commit_super)
+			sb->s_op->commit_super(sb);
+	}
+	unlock_super(sb);
+}
+
 /*
  * Note: check the dirty flag before waiting, so we don't
  * hold up the sync while mounting a device. (The newly
  * mounted device won't need syncing.)
  */
-void sync_supers(void)
+void dirty_super_op(void (*func)(struct super_block *))
 {
 	struct super_block * sb;
-restart:
 	spin_lock(&sb_lock);
+restart:
 	sb = sb_entry(super_blocks.next);
-	while (sb != sb_entry(&super_blocks))
+	while (sb != sb_entry(&super_blocks)) {
 		if (sb->s_dirt) {
 			sb->s_count++;
 			spin_unlock(&sb_lock);
 			down_read(&sb->s_umount);
-			write_super(sb);
-			drop_super(sb);
-			goto restart;
-		} else
-			sb = sb_entry(sb->s_list.next);
+			func(sb);
+			up_read(&sb->s_umount);
+
+			spin_lock(&sb_lock);
+			if (!--sb->s_count) {
+				destroy_super(sb);
+				goto restart;
+			} else if (list_empty(&sb->s_list)) {
+				goto restart;
+			}
+		} 
+		sb = sb_entry(sb->s_list.next);
+	}
 	spin_unlock(&sb_lock);
 }
 
+void sync_supers(void)
+{
+    dirty_super_op(write_super);
+}
+
+void commit_supers(void)
+{
+    dirty_super_op(commit_super);
+}
+
 /**
  *	get_super	-	get the superblock of a device
  *	@dev: device to get the superblock for
@@ -396,7 +434,7 @@
 	if ((flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY))
 		if (!fs_may_remount_ro(sb))
 			return -EBUSY;
-	if (sb->s_op && sb->s_op->remount_fs) {
+	if (sb->s_op->remount_fs) {
 		lock_super(sb);
 		retval = sb->s_op->remount_fs(sb, &flags, data);
 		unlock_super(sb);
--- 1.170/include/linux/fs.h	Fri Oct 11 04:49:46 2002
+++ edited/include/linux/fs.h	Wed Oct 16 10:42:16 2002
@@ -818,6 +818,7 @@
 	void (*umount_begin) (struct super_block *);
 
 	int (*show_options)(struct seq_file *, struct vfsmount *);
+	void (*commit_super) (struct super_block *);
 };
 
 /* Inode state bits.  Protected by inode_lock. */
@@ -1141,6 +1142,7 @@
 extern int filemap_fdatawrite(struct address_space *);
 extern int filemap_fdatawait(struct address_space *);
 extern void sync_supers(void);
+extern void commit_supers(void);
 extern sector_t bmap(struct inode *, sector_t);
 extern int notify_change(struct dentry *, struct iattr *);
 extern int permission(struct inode *, int);





^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] 2.5.x write_super is not for syncing
  2002-10-16 18:00 [PATCH] 2.5.x write_super is not for syncing Chris Mason
@ 2002-12-02 22:07 ` Andrew Morton
  2002-12-02 23:40   ` Chris Mason
  0 siblings, 1 reply; 21+ messages in thread
From: Andrew Morton @ 2002-12-02 22:07 UTC (permalink / raw)
  To: Chris Mason; +Cc: linux-fsdevel@vger.kernel.org

Six weeks ago, Chris Mason wrote:
> 
> Hello all,
> 
> This patch adds a commit_super super operation that allows the journaled
> filesystems to do something different for the periodic write_super calls
> and sync.

This is the "right fix" for the ext3 data=journal problem.

> Based on comments from Al about my last patch, alloc_super sets a
> default empty super_operations struct on each super.  This allows us to
> get rid of all checks for sb->s_ops == NULL.
> 
> sync_supers is changed so that it doesn't loop on a single FS if the
> write_super call leaves sb->s_dirt set.  I did this by changing
> generic_shutdown_super to use list_del_init(&sb->s_list), which allows
> us to check for supers that have been removed from the super_blocks list
> while we slept.  The idea came from an sgi patch Hugh Dickins sent me.
> 
> Anyway, this is against 2.5.43, please review:
> 
> --- 1.161/fs/buffer.c   Tue Oct  8 14:40:47 2002
> +++ edited/fs/buffer.c  Wed Oct 16 11:39:04 2002
> @@ -217,8 +217,10 @@
>         sync_inodes_sb(sb, 0);
>         DQUOT_SYNC(sb);
>         lock_super(sb);
> -       if (sb->s_dirt && sb->s_op && sb->s_op->write_super)
> +       if (sb->s_dirt && sb->s_op->write_super)
>                 sb->s_op->write_super(sb);
> +       if (sb->s_dirt && sb->s_op->commit_super)
> +               sb->s_op->commit_super(sb);

Except for the s_dirt test in here.  If s_dirt is zero and we
have dirty inodes, the filesystem _still_ is not told what to
do.

I don't think we'll ever get this right until we start telling the
filesystem what's happening.  So instead of all these little
presumptuous micro-syncs which we're doing in there, we need to turn
this inside out and just call sb->s_op->sync_everything_for_umount()
and let the fs decide how to get everything tight on disk.

That's a bit drastic.  At a minimum we need to remove that s_dirt
test and make the commit_super() call unconditional.

What would that break?

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] 2.5.x write_super is not for syncing
  2002-12-02 22:07 ` Andrew Morton
@ 2002-12-02 23:40   ` Chris Mason
  2002-12-03  0:03     ` Andrew Morton
  0 siblings, 1 reply; 21+ messages in thread
From: Chris Mason @ 2002-12-02 23:40 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-fsdevel@vger.kernel.org

On Mon, 2002-12-02 at 17:07, Andrew Morton wrote:

> Except for the s_dirt test in here.  If s_dirt is zero and we
> have dirty inodes, the filesystem _still_ is not told what to
> do.

Perhaps dirty inodes imply s_dirt should be true?

> 
> I don't think we'll ever get this right until we start telling the
> filesystem what's happening.  So instead of all these little
> presumptuous micro-syncs which we're doing in there, we need to turn
> this inside out and just call sb->s_op->sync_everything_for_umount()
> and let the fs decide how to get everything tight on disk.
> 
> That's a bit drastic.  At a minimum we need to remove that s_dirt
> test and make the commit_super() call unconditional.
> 
> What would that break?

Not much, since foofs_commit_super could always check for s_dirt if the
local FS really cared.

The downside is that since we don't check for s_dirt in commit_supers(),
the FS might get sunk twice if the we need to restart at the head of the
supers list due to unmount.  But, a double sync is possible anyway under
FS load in the same situation.

-chris

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] 2.5.x write_super is not for syncing
  2002-12-02 23:40   ` Chris Mason
@ 2002-12-03  0:03     ` Andrew Morton
  2002-12-03  1:10       ` Chris Mason
  0 siblings, 1 reply; 21+ messages in thread
From: Andrew Morton @ 2002-12-03  0:03 UTC (permalink / raw)
  To: Chris Mason; +Cc: linux-fsdevel@vger.kernel.org

Chris Mason wrote:
> 
> On Mon, 2002-12-02 at 17:07, Andrew Morton wrote:
> 
> > Except for the s_dirt test in here.  If s_dirt is zero and we
> > have dirty inodes, the filesystem _still_ is not told what to
> > do.
> 
> Perhaps dirty inodes imply s_dirt should be true?

Well afaik (why doesn't anybody ever comment stuff?  Do they prefer
buggy software?) s_dirt purely means "the superblock needs to be
written out".

> >
> > I don't think we'll ever get this right until we start telling the
> > filesystem what's happening.  So instead of all these little
> > presumptuous micro-syncs which we're doing in there, we need to turn
> > this inside out and just call sb->s_op->sync_everything_for_umount()
> > and let the fs decide how to get everything tight on disk.
> >
> > That's a bit drastic.  At a minimum we need to remove that s_dirt
> > test and make the commit_super() call unconditional.
> >
> > What would that break?
> 
> Not much, since foofs_commit_super could always check for s_dirt if the
> local FS really cared.
> 
> The downside is that since we don't check for s_dirt in commit_supers(),
> the FS might get sunk twice if the we need to restart at the head of the
> supers list due to unmount.  But, a double sync is possible anyway under
> FS load in the same situation.
> 

But you do check:

+static inline void commit_super(struct super_block *sb)
+{
+       lock_super(sb);
+       if (sb->s_root && sb->s_dirt) {
+               if (sb->s_op->write_super)
+                       sb->s_op->write_super(sb);
+               if (sb->s_op->commit_super)
+                       sb->s_op->commit_super(sb);
+       }
+       unlock_super(sb);
+}
+

I'm thinking that for this particular problem, the solution isn't
quite appropriate.  We don't really give a toss about the dirty
state of the superblock.  We want to sync the darn filesystem.
Sure, we can do that in ->commit_super(), but it really has nothing
to do with the superblock.

What about something along these lines?


--- 25/fs/buffer.c~sync-fs	Mon Dec  2 16:00:18 2002
+++ 25-akpm/fs/buffer.c	Mon Dec  2 16:02:37 2002
@@ -213,7 +213,7 @@ EXPORT_SYMBOL(sync_blockdev);
  * superblock.  Filesystem data as well as the underlying block
  * device.  Takes the superblock lock.
  */
-int fsync_super(struct super_block *sb)
+static int __sync_filesystem(struct super_block *sb)
 {
 	sync_inodes_sb(sb, 0);
 	DQUOT_SYNC(sb);
@@ -225,6 +225,13 @@ int fsync_super(struct super_block *sb)
 	sync_inodes_sb(sb, 1);
 
 	return sync_blockdev(sb->s_bdev);
+}
+
+int sync_filesystem(struct super_block *sb)
+{
+	if (sb->s_op && sb->s_op->sync_filesystem)
+		return sb->s_op->sync_filesystem(sb);
+	return __sync_filesytem(sb);
 }
 
 /*

_

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] 2.5.x write_super is not for syncing
  2002-12-03  0:03     ` Andrew Morton
@ 2002-12-03  1:10       ` Chris Mason
  2002-12-03  1:40         ` Andrew Morton
  0 siblings, 1 reply; 21+ messages in thread
From: Chris Mason @ 2002-12-03  1:10 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-fsdevel@vger.kernel.org

On Mon, 2002-12-02 at 19:03, Andrew Morton wrote:
> Chris Mason wrote:
> > 
> > On Mon, 2002-12-02 at 17:07, Andrew Morton wrote:
> > 
> > > Except for the s_dirt test in here.  If s_dirt is zero and we
> > > have dirty inodes, the filesystem _still_ is not told what to
> > > do.
> > 
> > Perhaps dirty inodes imply s_dirt should be true?
> 
> Well afaik (why doesn't anybody ever comment stuff?  Do they prefer
> buggy software?) s_dirt purely means "the superblock needs to be
> written out".

Sorry, neither of my comments were horribly clear.  Regardless of
s_dirt's current definition, s_dirt could mean 'something on the FS
needs to be written, and only the FS knows how to do it', or some other
wording that covers both the 'call me for periodic writes' case and the
'call me on sync()' case.  

I'm not saying we really need to muddy the s_dirt definition like that,
it's just an option in case someone really wants s_dirt checks
everywhere.

> > > That's a bit drastic.  At a minimum we need to remove that s_dirt
> > > test and make the commit_super() call unconditional.
> > >
> > > What would that break?
> > 
> > Not much, since foofs_commit_super could always check for s_dirt if the
> > local FS really cared.
> > 
> > The downside is that since we don't check for s_dirt in commit_supers(),
> > the FS might get sunk twice if the we need to restart at the head of the
> > supers list due to unmount.  But, a double sync is possible anyway under
> > FS load in the same situation.
> > 
> 
> But you do check:

Right, I meant that if we change the patch to remove the s_dirt check
from the commit_supers() function, a single FS might get sunk twice. 
That risk already exists when the FS is under load though, so I'm not
horribly worried about it.

> I'm thinking that for this particular problem, the solution isn't
> quite appropriate.  We don't really give a toss about the dirty
> state of the superblock.  We want to sync the darn filesystem.
> Sure, we can do that in ->commit_super(), but it really has nothing
> to do with the superblock.
> 
> What about something along these lines?

You'd still want sync_filesystem to get calls in the sys_sync case
though, right?  I'd be happy to rework the commit_super patch into a
sync_filesystem patch that doesn't bother with s_dirt except when
calling write_super().

The patch has two parts that matter in my mind:

1) In sync_supers (and other similar funcs), allow the FS to leave
s_dirt set.  This way we can setup per FS timings on when periodic
writes get done.

2) Give the FS a way to tell the difference between periodic writes that
don't need to force and wait for a commit, and a request from the user
that does require a full commit.

-chris

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] 2.5.x write_super is not for syncing
  2002-12-03  1:10       ` Chris Mason
@ 2002-12-03  1:40         ` Andrew Morton
  2002-12-03  3:09           ` Andrew Morton
  0 siblings, 1 reply; 21+ messages in thread
From: Andrew Morton @ 2002-12-03  1:40 UTC (permalink / raw)
  To: Chris Mason; +Cc: linux-fsdevel@vger.kernel.org

Chris Mason wrote:
> 
> ...
> > What about something along these lines?
> 
> You'd still want sync_filesystem to get calls in the sys_sync case
> though, right?

yes.

>  I'd be happy to rework the commit_super patch into a
> sync_filesystem patch that doesn't bother with s_dirt except when
> calling write_super().
> 
> The patch has two parts that matter in my mind:
> 
> 1) In sync_supers (and other similar funcs), allow the FS to leave
> s_dirt set.  This way we can setup per FS timings on when periodic
> writes get done.

hm.  Is that the simplest possible way of doing such a thing?
 
> 2) Give the FS a way to tell the difference between periodic writes that
> don't need to force and wait for a commit, and a request from the user
> that does require a full commit.

yup.  The whole sync thing is a mess, really.  It's designed for ext2,
and other filesystems just graft stuff on top of that, doing magical
things by inference.

It needs a complete rethink, IMO.  And I'm not really inclined to
do it in the 2.5 timeframe, mainly because the stuff we have now
mostly works, and everybody's busy on other stuff.

And we need a 2.4 solution, asap.  These two patches are what I'll
be testing tonight for 2.5.

 Documentation/filesystems/Locking |    7 +++++++
 fs/buffer.c                       |   21 +++++++++++++++++++--
 fs/dquot.c                        |    1 +
 fs/super.c                        |    5 ++---
 include/linux/buffer_head.h       |    2 --
 include/linux/fs.h                |    3 +++
 include/linux/writeback.h         |    1 -
 kernel/ksyms.c                    |    1 +
 8 files changed, 33 insertions(+), 8 deletions(-)

--- 25/fs/buffer.c~sync-fs	Mon Dec  2 16:24:34 2002
+++ 25-akpm/fs/buffer.c	Mon Dec  2 16:51:04 2002
@@ -213,7 +213,7 @@ EXPORT_SYMBOL(sync_blockdev);
  * superblock.  Filesystem data as well as the underlying block
  * device.  Takes the superblock lock.
  */
-int fsync_super(struct super_block *sb)
+static int __sync_filesystem(struct super_block *sb)
 {
 	sync_inodes_sb(sb, 0);
 	DQUOT_SYNC(sb);
@@ -227,6 +227,23 @@ int fsync_super(struct super_block *sb)
 	return sync_blockdev(sb->s_bdev);
 }
 
+int sync_filesystem(struct super_block *sb)
+{
+	int ret;
+	int cflags = current->flags;
+
+	current->flags |= PF_SYNC;	/* ick.  This is a "data integrity"
+					 * operation
+					 */
+
+	if (sb->s_op && sb->s_op->sync_filesystem)
+		ret = sb->s_op->sync_filesystem(sb);
+	else
+		ret = __sync_filesystem(sb);
+	current->flags = cflags;
+	return ret;
+}
+
 /*
  * Write out and wait upon all dirty data associated with this
  * device.   Filesystem data as well as the underlying block
@@ -236,7 +253,7 @@ int fsync_bdev(struct block_device *bdev
 {
 	struct super_block *sb = get_super(bdev);
 	if (sb) {
-		int res = fsync_super(sb);
+		int res = sync_filesystem(sb);
 		drop_super(sb);
 		return res;
 	}
--- 25/include/linux/buffer_head.h~sync-fs	Mon Dec  2 16:24:34 2002
+++ 25-akpm/include/linux/buffer_head.h	Mon Dec  2 16:24:34 2002
@@ -155,8 +155,6 @@ void __wait_on_buffer(struct buffer_head
 void sleep_on_buffer(struct buffer_head *bh);
 void wake_up_buffer(struct buffer_head *bh);
 int fsync_bdev(struct block_device *);
-int fsync_super(struct super_block *);
-int fsync_no_super(struct block_device *);
 struct buffer_head *__find_get_block(struct block_device *, sector_t, int);
 struct buffer_head * __getblk(struct block_device *, sector_t, int);
 void __brelse(struct buffer_head *);
--- 25/include/linux/fs.h~sync-fs	Mon Dec  2 16:24:34 2002
+++ 25-akpm/include/linux/fs.h	Mon Dec  2 16:44:00 2002
@@ -816,6 +816,7 @@ struct super_operations {
 	int (*remount_fs) (struct super_block *, int *, char *);
 	void (*clear_inode) (struct inode *);
 	void (*umount_begin) (struct super_block *);
+	int (*sync_filesystem)(struct super_block *);
 
 	int (*show_options)(struct seq_file *, struct vfsmount *);
 };
@@ -1311,6 +1312,8 @@ extern struct dentry *simple_lookup(stru
 extern ssize_t generic_read_dir(struct file *, char *, size_t, loff_t *);
 extern struct file_operations simple_dir_operations;
 extern struct inode_operations simple_dir_inode_operations;
+extern int sync_filesystem(struct super_block *sb);
+extern void sync_inodes_sb(struct super_block *, int wait);
 
 #ifdef CONFIG_BLK_DEV_INITRD
 extern unsigned int real_root_dev;
--- 25/fs/super.c~sync-fs	Mon Dec  2 16:24:34 2002
+++ 25-akpm/fs/super.c	Mon Dec  2 16:24:34 2002
@@ -27,7 +27,6 @@
 #include <linux/blkdev.h>
 #include <linux/quotaops.h>
 #include <linux/namei.h>
-#include <linux/buffer_head.h>		/* for fsync_super() */
 #include <linux/mount.h>
 #include <linux/security.h>
 #include <asm/uaccess.h>
@@ -180,7 +179,7 @@ void generic_shutdown_super(struct super
 		shrink_dcache_parent(root);
 		shrink_dcache_anon(&sb->s_anon);
 		dput(root);
-		fsync_super(sb);
+		sync_filesystem(sb);
 		lock_super(sb);
 		lock_kernel();
 		sb->s_flags &= ~MS_ACTIVE;
@@ -391,7 +390,7 @@ int do_remount_sb(struct super_block *sb
 	if (flags & MS_RDONLY)
 		acct_auto_close(sb);
 	shrink_dcache_sb(sb);
-	fsync_super(sb);
+	sync_filesystem(sb);
 	/* If we are remounting RDONLY, make sure there are no rw files open */
 	if ((flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY))
 		if (!fs_may_remount_ro(sb))
--- 25/Documentation/filesystems/Locking~sync-fs	Mon Dec  2 16:27:20 2002
+++ 25-akpm/Documentation/filesystems/Locking	Mon Dec  2 16:28:56 2002
@@ -96,6 +96,7 @@ prototypes:
 	int (*remount_fs) (struct super_block *, int *, char *);
 	void (*clear_inode) (struct inode *);
 	void (*umount_begin) (struct super_block *);
+	int (*sync_filesystem)(struct super_block *);
 
 locking rules:
 	All may block.
@@ -111,11 +112,17 @@ write_super:	no	yes	maybe		(see below)
 statfs:		no	no	no
 remount_fs:	yes	yes	maybe		(see below)
 umount_begin:	yes	no	maybe		(see below)
+sync_filesystem:no      no      ??
 
 ->read_inode() is not a method - it's a callback used in iget().
 rules for mount_sem are not too nice - it is going to die and be replaced
 by better scheme anyway.
 
+sync_filesystem() must commit all pending data (and metadata) to disk,
+and wait on its writeout.  It is a full sync of everything in that fs,
+period.
+
+
 --------------------------- file_system_type ---------------------------
 prototypes:
 	struct super_block *(*get_sb) (struct file_system_type *, int, char *, void *);
--- 25/include/linux/writeback.h~sync-fs	Mon Dec  2 16:43:32 2002
+++ 25-akpm/include/linux/writeback.h	Mon Dec  2 16:43:51 2002
@@ -50,7 +50,6 @@ struct writeback_control {
 void writeback_inodes(struct writeback_control *wbc);
 void wake_up_inode(struct inode *inode);
 void __wait_on_inode(struct inode * inode);
-void sync_inodes_sb(struct super_block *, int wait);
 void sync_inodes(int wait);
 
 /* writeback.h requires fs.h; it, too, is not included from here. */
--- 25/kernel/ksyms.c~sync-fs	Mon Dec  2 16:56:24 2002
+++ 25-akpm/kernel/ksyms.c	Mon Dec  2 16:57:37 2002
@@ -546,6 +546,7 @@ EXPORT_SYMBOL(buffer_insert_list);
 EXPORT_SYMBOL(make_bad_inode);
 EXPORT_SYMBOL(is_bad_inode);
 EXPORT_SYMBOL(__inode_dir_notify);
+EXPORT_SYMBOL(sync_inodes_sb);
 
 #ifdef CONFIG_UID16
 EXPORT_SYMBOL(overflowuid);
--- 25/fs/dquot.c~sync-fs	Mon Dec  2 16:58:07 2002
+++ 25-akpm/fs/dquot.c	Mon Dec  2 16:58:24 2002
@@ -1425,3 +1425,4 @@ EXPORT_SYMBOL(unregister_quota_format);
 EXPORT_SYMBOL(dqstats);
 EXPORT_SYMBOL(dq_list_lock);
 EXPORT_SYMBOL(dq_data_lock);
+EXPORT_SYMBOL(sync_dquots);

_


 fs/ext3/super.c |   85 ++++++++++++++++++++++++++++++++++++++++----------------
 1 files changed, 61 insertions(+), 24 deletions(-)

--- 25/fs/ext3/super.c~ext3-sync-fs	Mon Dec  2 17:00:20 2002
+++ 25-akpm/fs/ext3/super.c	Mon Dec  2 17:15:53 2002
@@ -29,6 +29,7 @@
 #include <linux/blkdev.h>
 #include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
+#include <linux/quotaops.h>
 #include <asm/uaccess.h>
 #include "xattr.h"
 #include "acl.h"
@@ -47,6 +48,7 @@ static void ext3_mark_recovery_complete(
 					struct ext3_super_block * es);
 static void ext3_clear_journal_err(struct super_block * sb,
 				   struct ext3_super_block * es);
+static int ext3_sync_filesystem(struct super_block *sb);
 
 #ifdef CONFIG_JBD_DEBUG
 int journal_no_write[2];
@@ -495,18 +497,19 @@ static void ext3_clear_inode(struct inod
 static struct super_operations ext3_sops = {
 	.alloc_inode	= ext3_alloc_inode,
 	.destroy_inode	= ext3_destroy_inode,
-	.read_inode	= ext3_read_inode,	/* BKL held */
-	.write_inode	= ext3_write_inode,	/* BKL not held.  Don't need */
-	.dirty_inode	= ext3_dirty_inode,	/* BKL not held.  We take it */
-	.put_inode	= ext3_put_inode,		/* BKL not held.  Don't need */
-	.delete_inode	= ext3_delete_inode,	/* BKL not held.  We take it */
-	.put_super	= ext3_put_super,		/* BKL held */
-	.write_super	= ext3_write_super,	/* BKL not held. We take it. Needed? */
-	.write_super_lockfs = ext3_write_super_lockfs, /* BKL not held. Take it */
-	.unlockfs	= ext3_unlockfs,		/* BKL not held.  We take it */
-	.statfs		= ext3_statfs,		/* BKL not held. */
-	.remount_fs	= ext3_remount,		/* BKL held */
-	.clear_inode	= ext3_clear_inode,	/* BKL not needed. */
+	.read_inode	= ext3_read_inode,
+	.write_inode	= ext3_write_inode,
+	.dirty_inode	= ext3_dirty_inode,
+	.put_inode	= ext3_put_inode,
+	.delete_inode	= ext3_delete_inode,
+	.put_super	= ext3_put_super,
+	.write_super	= ext3_write_super,
+	.sync_filesystem= ext3_sync_filesystem,
+	.write_super_lockfs = ext3_write_super_lockfs,
+	.unlockfs	= ext3_unlockfs,
+	.statfs		= ext3_statfs,
+	.remount_fs	= ext3_remount,
+	.clear_inode	= ext3_clear_inode,
 };
 
 struct dentry *ext3_get_parent(struct dentry *child);
@@ -1714,25 +1717,59 @@ int ext3_force_commit(struct super_block
  * This implicitly triggers the writebehind on sync().
  */
 
-static int do_sync_supers = 0;
-MODULE_PARM(do_sync_supers, "i");
-MODULE_PARM_DESC(do_sync_supers, "Write superblocks synchronously");
-
 void ext3_write_super (struct super_block * sb)
 {
-	tid_t target;
 	lock_kernel();	
 	if (down_trylock(&sb->s_lock) == 0)
-		BUG();		/* aviro detector */
+		BUG();
 	sb->s_dirt = 0;
-	target = log_start_commit(EXT3_SB(sb)->s_journal, NULL);
+	log_start_commit(EXT3_SB(sb)->s_journal, NULL);
+	unlock_kernel();
+}
 
-	if (do_sync_supers) {
-		unlock_super(sb);
-		log_wait_commit(EXT3_SB(sb)->s_journal, target);
-		lock_super(sb);
-	}
+static int ext3_sync_filesystem(struct super_block *sb)
+{
+	tid_t target;
+
+	/*
+	 * The pagecache flush may instantiate new indirects and
+	 * may dirty inodes.
+	 */
+	sync_inodes_sb(sb, 0);
+
+	/*
+	 * Quota sync may write to files
+	 */
+	DQUOT_SYNC(sb);
+
+	/*
+	 * So sync the files again
+	 */
+	sync_inodes_sb(sb, 0);
+
+	/*
+	 * Now run a commit, and wait on it.
+	 */
+	lock_kernel();	
+	sb->s_dirt = 0;
+	target = log_start_commit(EXT3_SB(sb)->s_journal, NULL);
+	log_wait_commit(EXT3_SB(sb)->s_journal, target);
 	unlock_kernel();
+
+	/*
+	 * For data=journal inodes, the commit may have generated new dirty
+	 * buffers.  Their metadata is all tight on disk, so just flush the
+	 * pages again.  And wait on them.
+	 */
+	sync_inodes_sb(sb, 0);		/* Nicer IO scheduling */
+	sync_inodes_sb(sb, 1);		/* Wait here */
+
+	/*
+	 * And we know that the only dirty buffers will be against the blockdev
+	 * mapping, so write them and wait on them.
+	 */
+	sync_blockdev(sb->s_bdev);
+	return 0;
 }
 
 /*

_

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] 2.5.x write_super is not for syncing
  2002-12-03  1:40         ` Andrew Morton
@ 2002-12-03  3:09           ` Andrew Morton
  2002-12-03 19:36             ` Bryan Henderson
  0 siblings, 1 reply; 21+ messages in thread
From: Andrew Morton @ 2002-12-03  3:09 UTC (permalink / raw)
  To: Chris Mason, linux-fsdevel@vger.kernel.org, Stephen C. Tweedie

Andrew Morton wrote:
> 
> These two patches are what I'll be testing tonight for 2.5.
> 

I don't feel inclined to take on the whole sync thing at present.
So a minimal fix for this problem is as follows.  Against 2.4.20:

 fs/buffer.c        |    2 ++
 fs/super.c         |    4 ++++
 include/linux/fs.h |    1 +
 3 files changed, 7 insertions(+)

--- linux-akpm/fs/buffer.c~commit-fs	Mon Dec  2 18:40:03 2002
+++ linux-akpm-akpm/fs/buffer.c	Mon Dec  2 18:41:34 2002
@@ -327,6 +327,8 @@ int fsync_super(struct super_block *sb)
 	lock_super(sb);
 	if (sb->s_dirt && sb->s_op && sb->s_op->write_super)
 		sb->s_op->write_super(sb);
+	if (sb->s_op && sb->s_op->sync_fs)
+		sb->s_op->sync_fs(sb);
 	unlock_super(sb);
 	unlock_kernel();
 
--- linux-akpm/include/linux/fs.h~commit-fs	Mon Dec  2 18:40:23 2002
+++ linux-akpm-akpm/include/linux/fs.h	Mon Dec  2 18:40:56 2002
@@ -894,6 +894,7 @@ struct super_operations {
 	void (*delete_inode) (struct inode *);
 	void (*put_super) (struct super_block *);
 	void (*write_super) (struct super_block *);
+	int (*sync_fs) (struct super_block *);
 	void (*write_super_lockfs) (struct super_block *);
 	void (*unlockfs) (struct super_block *);
 	int (*statfs) (struct super_block *, struct statfs *);
--- linux-akpm/fs/super.c~commit-fs	Mon Dec  2 18:44:01 2002
+++ linux-akpm-akpm/fs/super.c	Mon Dec  2 18:44:07 2002
@@ -454,6 +454,8 @@ void sync_supers(kdev_t dev)
 		if (sb) {
 			if (sb->s_dirt)
 				write_super(sb);
+			if (sb->s_op && sb->s_op->sync_fs)
+				sb->s_op->sync_fs(sb);
 			drop_super(sb);
 		}
 		return;
@@ -467,6 +469,8 @@ restart:
 			spin_unlock(&sb_lock);
 			down_read(&sb->s_umount);
 			write_super(sb);
+			if (sb->s_op && sb->s_op->sync_fs)
+				sb->s_op->sync_fs(sb);
 			drop_super(sb);
 			goto restart;
 		} else

_

and

 fs/ext3/super.c |   25 +++++++++++++------------
 1 files changed, 13 insertions(+), 12 deletions(-)

--- linux-akpm/fs/ext3/super.c~ext3_sync_fs	Mon Dec  2 18:45:10 2002
+++ linux-akpm-akpm/fs/ext3/super.c	Mon Dec  2 18:46:59 2002
@@ -47,6 +47,8 @@ static void ext3_mark_recovery_complete(
 static void ext3_clear_journal_err(struct super_block * sb,
 				   struct ext3_super_block * es);
 
+static int ext3_sync_fs(struct super_block * sb);
+
 #ifdef CONFIG_JBD_DEBUG
 int journal_no_write[2];
 
@@ -454,6 +456,7 @@ static struct super_operations ext3_sops
 	delete_inode:	ext3_delete_inode,	/* BKL not held.  We take it */
 	put_super:	ext3_put_super,		/* BKL held */
 	write_super:	ext3_write_super,	/* BKL held */
+	sync_fs:	ext3_sync_fs,
 	write_super_lockfs: ext3_write_super_lockfs, /* BKL not held. Take it */
 	unlockfs:	ext3_unlockfs,		/* BKL not held.  We take it */
 	statfs:		ext3_statfs,		/* BKL held */
@@ -1577,24 +1580,22 @@ int ext3_force_commit(struct super_block
  * This implicitly triggers the writebehind on sync().
  */
 
-static int do_sync_supers = 0;
-MODULE_PARM(do_sync_supers, "i");
-MODULE_PARM_DESC(do_sync_supers, "Write superblocks synchronously");
-
 void ext3_write_super (struct super_block * sb)
 {
+	if (down_trylock(&sb->s_lock) == 0)
+		BUG();
+	sb->s_dirt = 0;
+	log_start_commit(EXT3_SB(sb)->s_journal, NULL);
+}
+
+static int ext3_sync_fs(struct super_block *sb)
+{
 	tid_t target;
 	
-	if (down_trylock(&sb->s_lock) == 0)
-		BUG();		/* aviro detector */
 	sb->s_dirt = 0;
 	target = log_start_commit(EXT3_SB(sb)->s_journal, NULL);
-
-	if (do_sync_supers) {
-		unlock_super(sb);
-		log_wait_commit(EXT3_SB(sb)->s_journal, target);
-		lock_super(sb);
-	}
+	log_wait_commit(EXT3_SB(sb)->s_journal, target);
+	return 0;
 }
 
 /*

_

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] 2.5.x write_super is not for syncing
  2002-12-03  3:09           ` Andrew Morton
@ 2002-12-03 19:36             ` Bryan Henderson
  2002-12-03 20:06               ` Andrew Morton
  0 siblings, 1 reply; 21+ messages in thread
From: Bryan Henderson @ 2002-12-03 19:36 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-fsdevel@vger.kernel.org, Chris Mason, Stephen C. Tweedie

>--- linux-akpm/fs/buffer.c~commit-fs            Mon Dec  2 18:40:03 2002
>+++ linux-akpm-akpm/fs/buffer.c           Mon Dec  2 18:41:34 2002
>@@ -327,6 +327,8 @@ int fsync_super(struct super_block *sb)
>            lock_super(sb);
>            if (sb->s_dirt && sb->s_op && sb->s_op->write_super)
>                        sb->s_op->write_super(sb);
>+           if (sb->s_op && sb->s_op->sync_fs)
>+                       sb->s_op->sync_fs(sb);
>            unlock_super(sb);
>            unlock_kernel();
> etc.

That's very close to solving a major problem I've been struggling with for
a while in this area.  There seems to be a big flaw, though.

I have a filesystem type (STFS) that has no super block.  According to the
spirit of the Linux design, then, the s_dirt flag has no meaning and the
filesystem driver has no business having a ->write_super routine.

But of course a ->write_super call is the only evidence I have that the
user did a sync(), which is something I need to know.  Unfortunately, a
->write_super call can also simply mean kupdated is doing its periodic
thing.  So the code I had to write to implement sync(), while not looking
like ext2 in a bunch of other ways, horribly abuses the interface, works
differently on different versions of Linux, and in some circumstances isn't
even 100% correct.

A driver for the same filesystem type on AIX has no problem because AIX has
an equivalent to ->sync_fs and nothing like ->write_super (AIX VFS is more
general -- doesn't pretend to know about things like super blocks).

So this change looks like it makes Linux as hospitable as AIX to STFS, but
here's where it appears to fall short:  It still uses the same VFS call to
implement the sync() system call as to implement kupdated's purpose.
Could there be a separate function or an argument, or do I have to continue
looking at the process name to distinguish them?

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] 2.5.x write_super is not for syncing
  2002-12-03 19:36             ` Bryan Henderson
@ 2002-12-03 20:06               ` Andrew Morton
  2002-12-03 21:41                 ` Bryan Henderson
  0 siblings, 1 reply; 21+ messages in thread
From: Andrew Morton @ 2002-12-03 20:06 UTC (permalink / raw)
  To: Bryan Henderson
  Cc: linux-fsdevel@vger.kernel.org, Chris Mason, Stephen C. Tweedie

Bryan Henderson wrote:
> 
> ...
> But of course a ->write_super call is the only evidence I have that the
> user did a sync(), which is something I need to know.

You and the rest of the world.

>  Unfortunately, a
> ->write_super call can also simply mean kupdated is doing its periodic
> thing.  So the code I had to write to implement sync(), while not looking
> like ext2 in a bunch of other ways, horribly abuses the interface, works
> differently on different versions of Linux, and in some circumstances isn't
> even 100% correct.
> 
> A driver for the same filesystem type on AIX has no problem because AIX has
> an equivalent to ->sync_fs and nothing like ->write_super (AIX VFS is more
> general -- doesn't pretend to know about things like super blocks).
> 
> So this change looks like it makes Linux as hospitable as AIX to STFS, but
> here's where it appears to fall short:  It still uses the same VFS call to
> implement the sync() system call as to implement kupdated's purpose.
> Could there be a separate function or an argument, or do I have to continue
> looking at the process name to distinguish them?

Oh darn.  Yes, both kupdate and sync call sync_supers().  But
kupdate should not call ->sync_fs.

I'll fix that up.  The intent is that ->sync_fs() should only be
called for sync, unmount, etc.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] 2.5.x write_super is not for syncing
  2002-12-03 20:06               ` Andrew Morton
@ 2002-12-03 21:41                 ` Bryan Henderson
  2002-12-03 22:13                   ` Andrew Morton
  0 siblings, 1 reply; 21+ messages in thread
From: Bryan Henderson @ 2002-12-03 21:41 UTC (permalink / raw)
  To: Andrew Morton
  Cc: akpm, linux-fsdevel@vger.kernel.org, Chris Mason,
	Stephen C. Tweedie

>The intent is that ->sync_fs() should only be
>called for sync, unmount, etc.

Why unmount, etc.?  That's just more assumption-making by FS as to what
these operations require for every filesystem type.

unmount, which in general requires a rather different kind of syncing than
sync(), already has its own VFS call.
I don't know what etc. would be but it's probably also something a
filesystem driver might want to distinguish from sync().

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] 2.5.x write_super is not for syncing
  2002-12-03 21:41                 ` Bryan Henderson
@ 2002-12-03 22:13                   ` Andrew Morton
  2002-12-04  2:05                     ` Bryan Henderson
  0 siblings, 1 reply; 21+ messages in thread
From: Andrew Morton @ 2002-12-03 22:13 UTC (permalink / raw)
  To: Bryan Henderson
  Cc: linux-fsdevel@vger.kernel.org, Chris Mason, Stephen C. Tweedie

Bryan Henderson wrote:
> 
> >The intent is that ->sync_fs() should only be
> >called for sync, unmount, etc.
> 
> Why unmount, etc.?  That's just more assumption-making by FS as to what
> these operations require for every filesystem type.
> 
> unmount, which in general requires a rather different kind of syncing than
> sync(), already has its own VFS call.
> I don't know what etc. would be but it's probably also something a
> filesystem driver might want to distinguish from sync().

Well, I'd say that telling the fs about umount activity is "out
of scope" for this exercise ;)

It seems reasonable to have a super_op which says "sync everything
to disk".  And to call that in response to /bin/sync and unmount.

Doesn't it?

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] 2.5.x write_super is not for syncing
  2002-12-03 22:13                   ` Andrew Morton
@ 2002-12-04  2:05                     ` Bryan Henderson
  2002-12-04  4:29                       ` Andrew Morton
  2002-12-04 21:10                       ` Stephen C. Tweedie
  0 siblings, 2 replies; 21+ messages in thread
From: Bryan Henderson @ 2002-12-04  2:05 UTC (permalink / raw)
  To: Andrew Morton
  Cc: akpm, linux-fsdevel@vger.kernel.org, Chris Mason,
	Stephen C. Tweedie

>Well, I'd say that telling the fs about umount activity is "out
>of scope" for this exercise ;)

Not sure what your point is here.  The filesystem driver already knows when
the filesystem image is being destroyed (i.e. the struct super_block is
going away), which is the only umount activity I'm talking about.  If
destroying a filesystem image of a particular type means cached stuff has
to get sync'ed, the filesystem driver knows to do it then; there's no need
for a ->sync_fs call in addition to ->put_super.

>It seems reasonable to have a super_op which says "sync everything
>to disk".

Well, maybe, but the question is when should FS call that?  I say
performing the sync() system call is the only time it should.  If it calls
it for any other reason, it is making assumptions about how the particular
filesystem type works.

Because "sync everything to disk" is ambiguous in the modern diverse world
of filesystems (so is fsync() -- note that standards define these pretty
vaguely in recognition of that), the only sensible way to define it is to
say "it's what sync() does -- ask your filesystem type designer what sync()
does on your filesystem."

If there's any other writing of cached data to disk that needs to be done,
I say the filesystem driver knows when to do it; FS should not presume to
know.

Of course, in ext2 and its ilk, FS understands the inner workings and is
deeply involved in caching and uses the traditional ->write_super, etc. to
carry it out.  We're only talking about new, more general kinds of
filesystems now.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] 2.5.x write_super is not for syncing
  2002-12-04  2:05                     ` Bryan Henderson
@ 2002-12-04  4:29                       ` Andrew Morton
  2002-12-04 19:00                         ` Bryan Henderson
  2002-12-04 21:10                       ` Stephen C. Tweedie
  1 sibling, 1 reply; 21+ messages in thread
From: Andrew Morton @ 2002-12-04  4:29 UTC (permalink / raw)
  To: Bryan Henderson
  Cc: linux-fsdevel@vger.kernel.org, Chris Mason, Stephen C. Tweedie

Bryan Henderson wrote:
> 
> >Well, I'd say that telling the fs about umount activity is "out
> >of scope" for this exercise ;)
> 
> Not sure what your point is here.  The filesystem driver already knows when
> the filesystem image is being destroyed (i.e. the struct super_block is
> going away), which is the only umount activity I'm talking about.  If
> destroying a filesystem image of a particular type means cached stuff has
> to get sync'ed, the filesystem driver knows to do it then; there's no need
> for a ->sync_fs call in addition to ->put_super.

In the context of the 2.4 kernel it is not practical to change every
filesystem to sync itself at unmount.

Nor in 2.5, by my hand...

> >It seems reasonable to have a super_op which says "sync everything
> >to disk".
> 
> Well, maybe, but the question is when should FS call that?  I say
> performing the sync() system call is the only time it should.  If it calls
> it for any other reason, it is making assumptions about how the particular
> filesystem type works.

Yes, it always has.  And Linux's tradition of forcing the filesystems
to comply with the VFS's and VM's requirements has always been a
strength.  Because we can make deep changes to the VM and VFS without
having to make deep changes to filesytems.

Yes, this makes porting filesystems from other OS'es harder.  That is
tough luck.  It's better this way.  Filesystems which do weird stuff
are a pain - they drag down core kernel development.

> Because "sync everything to disk" is ambiguous in the modern diverse world
> of filesystems (so is fsync() -- note that standards define these pretty
> vaguely in recognition of that), the only sensible way to define it is to
> say "it's what sync() does -- ask your filesystem type designer what sync()
> does on your filesystem."
> 
> If there's any other writing of cached data to disk that needs to be done,
> I say the filesystem driver knows when to do it; FS should not presume to
> know.

The move is more toward "if the fs needs to do special stuff then it
should have a hook to do that".  In concrete terms, this means rather
than "write your superblock now" we need to tell the fs what's happening
at a higher level, and if the fs isn't interested, perform the default
(ext2-style) operation.

I don't think anyone has a problem with that approach.  And as new
filesystems are added to Linux, each one brings in its own set
of new hooks.  And I think that's a fine approach - we should not
be speculatively adding hooks if they have no in-kernel consumers.

> Of course, in ext2 and its ilk, FS understands the inner workings and is
> deeply involved in caching and uses the traditional ->write_super, etc. to
> carry it out.  We're only talking about new, more general kinds of
> filesystems now.

->write_super() seems to be a bit of a crock.  It _should_ be "the VFS
superblock info has changed.  You should propagate that to pagecache
but not start IO against it".  It's not clear what it actually means
at present.

But for your fs, I suspect we'll need to see specific descriptions
of the problems which are being encountered, and patches.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] 2.5.x write_super is not for syncing
  2002-12-04  4:29                       ` Andrew Morton
@ 2002-12-04 19:00                         ` Bryan Henderson
  2002-12-04 19:23                           ` Matthew Wilcox
  0 siblings, 1 reply; 21+ messages in thread
From: Bryan Henderson @ 2002-12-04 19:00 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-fsdevel@vger.kernel.org, Chris Mason, Stephen C. Tweedie

>In the context of the 2.4 kernel it is not practical to change every
>filesystem to sync itself at unmount.
>
>Nor in 2.5, by my hand...

I agree.  They would continue to be synced at unmount time by FS like they
are today.  They would continue not to implement a ->fs_sync routine.  I
thought we were talking about expanding the VFS interface so that
filesystem types for which the existing system is inadequate can still
work.

>Linux's tradition of forcing the filesystems
>to comply with the VFS's and VM's requirements has always been a
>strength.

OK, well at least we found the heart of the disagreement.  I consider that
a weakness and favor flexibility.

>Yes, this makes porting filesystems from other OS'es harder.  That is
>tough luck.  It's better this way.  Filesystems which do weird stuff
>are a pain - they drag down core kernel development.

It's not so much porting from other OSes -- that isn't really important.
It's implementing new concepts where loading a new kernel module into an
existing system is acceptable, but replacing the base kernel in that system
is not.  In the particular case of my filesystem type -- STFS -- that is
possible with AIX, Solaris, and Windows but not Linux.  And that's not
because those systems were designed for the particular needs of STFS or
vice versa -- it's just because their "VFS" interface is at a higher level.

I'm not arguing; everyone has different goals for Linux in general and for
kernel.org kernels in particular, and they're often quite different from
the designers' goals for those other operating systems.  I'm just exposing
the big picture.

>we should not be speculatively adding hooks if they have no in-kernel
consumers.

That policy is pretty hostile toward commercial use of Linux (which is what
my job at IBM is concerned with).  In the commercial world, it is generally
required that you be able to add a new function to an existing system by
doing something as simple as loading a new kernel module.  The truth is
that for STFS, it's already too late (for commercial practicality) to talk
about fixing the sync problem via kernel.org kernels.  I've just been using
it as an example and hoping the problem could be avoided for the next novel
filesystem type to come along.

>->write_super() seems to be a bit of a crock.  It _should_ be "the VFS
>superblock info has changed.  You should propagate that to pagecache
>but not start IO against it".  It's not clear what it actually means
>at present.

However one defines it (and in Linux definitions of interfaces are strictly
matters of opinion), it _should_ be either an imperative interface or a
declarative one; your proposal is both at the same time.  An imperative
interface call says, "Do X.  It's none of your business why I think you
should."  A declarative interface says, "X is now true.  Do whatever you
think you should based on that information."  So I would go either with
"superblock info has changed" or "write your superblock into the
pagecache," but not both.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] 2.5.x write_super is not for syncing
  2002-12-04 19:00                         ` Bryan Henderson
@ 2002-12-04 19:23                           ` Matthew Wilcox
  2002-12-04 22:17                             ` Bryan Henderson
  0 siblings, 1 reply; 21+ messages in thread
From: Matthew Wilcox @ 2002-12-04 19:23 UTC (permalink / raw)
  To: Bryan Henderson
  Cc: Andrew Morton, linux-fsdevel@vger.kernel.org, Chris Mason,
	Stephen C. Tweedie

On Wed, Dec 04, 2002 at 11:00:19AM -0800, Bryan Henderson wrote:
> >Linux's tradition of forcing the filesystems
> >to comply with the VFS's and VM's requirements has always been a
> >strength.
> 
> OK, well at least we found the heart of the disagreement.  I consider that
> a weakness and favor flexibility.

That's certainly a matter of opinion.  I'm quite firmly on the same side as Andrew, and I offer supporting material:

http://www.caldo.demon.co.uk/doc/taste.pdf (Section 3.2 is probably most
germane, but it's a short paper and worth reading in full).

> In the particular case of my filesystem type -- STFS -- that is
> possible with AIX, Solaris, and Windows but not Linux.

What is STFS, btw?

> > we should not be speculatively adding hooks if they have no in-kernel
> > consumers.
> 
> That policy is pretty hostile toward commercial use of Linux (which is what
> my job at IBM is concerned with).

That's not true.  It's hostile towards non-source extensions of Linux.
That's always been abundantly clear.

-- 
"It's not Hollywood.  War is real, war is primarily not about defeat or
victory, it is about death.  I've seen thousands and thousands of dead bodies.
Do you think I want to have an academic debate on this subject?" -- Robert Fisk

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] 2.5.x write_super is not for syncing
  2002-12-04  2:05                     ` Bryan Henderson
  2002-12-04  4:29                       ` Andrew Morton
@ 2002-12-04 21:10                       ` Stephen C. Tweedie
  2002-12-04 22:46                         ` Bryan Henderson
  1 sibling, 1 reply; 21+ messages in thread
From: Stephen C. Tweedie @ 2002-12-04 21:10 UTC (permalink / raw)
  To: Bryan Henderson
  Cc: Andrew Morton, linux-fsdevel@vger.kernel.org, Chris Mason,
	Stephen Tweedie

Hi,

On Wed, 2002-12-04 at 02:05, Bryan Henderson wrote:

> >Well, I'd say that telling the fs about umount activity is "out
> >of scope" for this exercise ;)
> 
> Not sure what your point is here.  The filesystem driver already knows when
> the filesystem image is being destroyed (i.e. the struct super_block is
> going away), which is the only umount activity I'm talking about.  If
> destroying a filesystem image of a particular type means cached stuff has
> to get sync'ed, the filesystem driver knows to do it then; there's no need
> for a ->sync_fs call in addition to ->put_super.

Unfortunately, for 2.4 that is not true.  ->put_super() cannot do the
cache flush.  It can't do it, because by the time we call it, the core
VFS layer has already completely flushed the cache from memory, so it's
too late.  That's why we need a separate call to indicate the start of
the umount process; put_super only marks the end of it when it's too
late to do this particular piece of processing.

> >It seems reasonable to have a super_op which says "sync everything
> >to disk".
> 
> Well, maybe, but the question is when should FS call that?

And more importantly, when is it finished?  That's the piece of the
equation that's missing in the umount case.

--Stephen

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] 2.5.x write_super is not for syncing
  2002-12-04 19:23                           ` Matthew Wilcox
@ 2002-12-04 22:17                             ` Bryan Henderson
  2002-12-05 10:36                               ` Arnaldo Carvalho de Melo
  0 siblings, 1 reply; 21+ messages in thread
From: Bryan Henderson @ 2002-12-04 22:17 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Andrew Morton, linux-fsdevel@vger.kernel.org, Chris Mason,
	Stephen C. Tweedie

>What is STFS, btw?

It's Storage Tank File System -- a shared filesystem type that IBM has been
talking about making for a couple of years.  It's supposed to allow you to
put a single filesystem on a large number of shared disks and have all the
processes on all those systems access the filesystem as if they were all on
one big system.  But those disks contain only data blocks.  Metadata is
kept behind a server which is accessed via conventional network interfaces.
Clients of that metadata server aggressively cache metadata in order to
provide performance equivalent to local filesystems
in usual cases.

The various systems can be running various operating systems, unix and
Windows alike.  It would be nice if some could be Linux.

Linux FS assumes in many ways that the contents of a filesystem don't
change except through it.  The basic FS structure has been bent (with
various revalidate VFS calls) to make NFS work, but NFS does not promise
(or deliver) perfect file sharing.  The fact that FS presumes to be able to
manage a cache of filesystem contents interferes with the STFS filesystem
driver's need to coordinate file contents with other systems that see the
same disks and metadata servers.

Another way it has been hard to get correct STFS semantics out of Linux is
that the cache the filesystem driver keeps is there whether or not you have
the filesystem mounted (or mounted 10 times on the same system).  So
unmounting does not necessarily mean you need to sync the cache, but I
still believe if a user invokes sync(), he expects all updates in that
cache to be made permanent.  Hence my wish for FS to have a "sync
everything" VFS call,
but not to call it just because it is unmounting.

>> > we should not be speculatively adding hooks if they have no in-kernel
>> > consumers.
>>
>> That policy is pretty hostile toward commercial use of Linux (which is
what
>> my job at IBM is concerned with).
>
>That's not true.  It's hostile towards non-source extensions of Linux.

I assume "non-source extension" means an extension that isn't distributed
in the Linux kernel source tree, right?

My claim is that commercial use is highly dependent on non-source
extensions, so hostile toward non-source extensions means hostile toward
commercial use.  Commerce works by consumers getting stuff from multiple
independent suppliers, who supply to multiple independent consumers.  I
point to traditional commercial operating systems for examples:  AIX's VFS
interface allows for kernel extensions supplied by companies that have no
working relationship with IBM and which do things no one else was doing at
the time AIX shipped.

>That's always been abundantly clear.

I agree.  Its one of the things people cite whenever I try to include Linux
in a commercial endeavor.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] 2.5.x write_super is not for syncing
  2002-12-04 21:10                       ` Stephen C. Tweedie
@ 2002-12-04 22:46                         ` Bryan Henderson
  0 siblings, 0 replies; 21+ messages in thread
From: Bryan Henderson @ 2002-12-04 22:46 UTC (permalink / raw)
  To: Stephen C. Tweedie
  Cc: Andrew Morton, linux-fsdevel@vger.kernel.org, Chris Mason,
	Stephen Tweedie

>->put_super() cannot do the
>cache flush.  It can't do it, because by the time we call it, the core
>VFS layer has already completely flushed the cache from memory,

I think this ->fs_sync call we've been talking about is something different
from what I thought it was.  Is this about pushing updates which are cached
above the page cache into the page cache where normal page cache cleaning
can effect the actual writing?

If so, that's much more specialized than what I imagined -- it's applicable
only to a fairly specific filesystem driver implementation.  I'd probably
call that something like ->fs_prepare_for_sync instead.

Not that it won't get abused just like ->write_super by folks who just want
to detect when someone does sync() :-)

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] 2.5.x write_super is not for syncing
  2002-12-04 22:17                             ` Bryan Henderson
@ 2002-12-05 10:36                               ` Arnaldo Carvalho de Melo
  2002-12-05 16:31                                 ` Stephen C. Tweedie
  0 siblings, 1 reply; 21+ messages in thread
From: Arnaldo Carvalho de Melo @ 2002-12-05 10:36 UTC (permalink / raw)
  To: Bryan Henderson
  Cc: Matthew Wilcox, Andrew Morton, linux-fsdevel@vger.kernel.org,
	Chris Mason, Stephen C. Tweedie

Em Wed, Dec 04, 2002 at 02:17:40PM -0800, Bryan Henderson escreveu:
> 
> >What is STFS, btw?
> 
> It's Storage Tank File System -- a shared filesystem type that IBM has been
> talking about making for a couple of years.  It's supposed to allow you to
> put a single filesystem on a large number of shared disks and have all the
> processes on all those systems access the filesystem as if they were all on
> one big system.  But those disks contain only data blocks.  Metadata is
> kept behind a server which is accessed via conventional network interfaces.
> Clients of that metadata server aggressively cache metadata in order to
> provide performance equivalent to local filesystems
> in usual cases.

Alert: FS newbie talking.

Isn't this what Peter Braam's Lustre FS is doing? http://www.lustre.org

- Arnaldo

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] 2.5.x write_super is not for syncing
  2002-12-05 10:36                               ` Arnaldo Carvalho de Melo
@ 2002-12-05 16:31                                 ` Stephen C. Tweedie
  2002-12-05 17:24                                   ` girish
  0 siblings, 1 reply; 21+ messages in thread
From: Stephen C. Tweedie @ 2002-12-05 16:31 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo
  Cc: Bryan Henderson, Matthew Wilcox, Andrew Morton,
	linux-fsdevel@vger.kernel.org, Chris Mason, Stephen Tweedie

Hi,

On Thu, 2002-12-05 at 10:36, Arnaldo Carvalho de Melo wrote:
> Em Wed, Dec 04, 2002 at 02:17:40PM -0800, Bryan Henderson escreveu:

> > It's Storage Tank File System -- a shared filesystem type that IBM has been
> > talking about making for a couple of years.  It's supposed to allow you to
> > put a single filesystem on a large number of shared disks and have all the
> > processes on all those systems access the filesystem as if they were all on
> > one big system.  But those disks contain only data blocks.  Metadata is
> > kept behind a server which is accessed via conventional network interfaces.

> Isn't this what Peter Braam's Lustre FS is doing? http://www.lustre.org

Yes, sounds very similar indeed.

Cheers,
 Stephen


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] 2.5.x write_super is not for syncing
  2002-12-05 16:31                                 ` Stephen C. Tweedie
@ 2002-12-05 17:24                                   ` girish
  0 siblings, 0 replies; 21+ messages in thread
From: girish @ 2002-12-05 17:24 UTC (permalink / raw)
  To: Stephen C. Tweedie, Arnaldo Carvalho de Melo
  Cc: Bryan Henderson, Matthew Wilcox, Andrew Morton, linux-fsdevel,
	Chris Mason, Stephen Tweedie

> Hi,
>
> On Thu, 2002-12-05 at 10:36, Arnaldo Carvalho de Melo wrote:
> > Em Wed, Dec 04, 2002 at 02:17:40PM -0800, Bryan Henderson escreveu:
>
> > > It's Storage Tank File System -- a shared filesystem type that IBM has
been
> > > talking about making for a couple of years.  It's supposed to allow
you to
> > > put a single filesystem on a large number of shared disks and have all
the
> > > processes on all those systems access the filesystem as if they were
all on
> > > one big system.  But those disks contain only data blocks.  Metadata
is
> > > kept behind a server which is accessed via conventional network
interfaces.
>
> > Isn't this what Peter Braam's Lustre FS is doing? http://www.lustre.org
>
> Yes, sounds very similar indeed.
In lustre we have network connection between client and data servers, where
as STFS  is similar to CXFS from SGI

Regards
Girish


^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2002-12-05 17:24 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2002-10-16 18:00 [PATCH] 2.5.x write_super is not for syncing Chris Mason
2002-12-02 22:07 ` Andrew Morton
2002-12-02 23:40   ` Chris Mason
2002-12-03  0:03     ` Andrew Morton
2002-12-03  1:10       ` Chris Mason
2002-12-03  1:40         ` Andrew Morton
2002-12-03  3:09           ` Andrew Morton
2002-12-03 19:36             ` Bryan Henderson
2002-12-03 20:06               ` Andrew Morton
2002-12-03 21:41                 ` Bryan Henderson
2002-12-03 22:13                   ` Andrew Morton
2002-12-04  2:05                     ` Bryan Henderson
2002-12-04  4:29                       ` Andrew Morton
2002-12-04 19:00                         ` Bryan Henderson
2002-12-04 19:23                           ` Matthew Wilcox
2002-12-04 22:17                             ` Bryan Henderson
2002-12-05 10:36                               ` Arnaldo Carvalho de Melo
2002-12-05 16:31                                 ` Stephen C. Tweedie
2002-12-05 17:24                                   ` girish
2002-12-04 21:10                       ` Stephen C. Tweedie
2002-12-04 22:46                         ` Bryan Henderson

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.