* [PATCH] 2.5.x write_super is not for syncing
@ 2002-10-16 18:00 Chris Mason
2002-12-02 22:07 ` Andrew Morton
0 siblings, 1 reply; 21+ messages in thread
From: Chris Mason @ 2002-10-16 18:00 UTC (permalink / raw)
To: linux-kernel
Hello all,
This patch adds a commit_super super operation that allows the journaled
filesystems to do something different for the periodic write_super calls
and sync.
Based on comments from Al about my last patch, alloc_super sets a
default empty super_operations struct on each super. This allows us to
get rid of all checks for sb->s_ops == NULL.
sync_supers is changed so that it doesn't loop on a single FS if the
write_super call leaves sb->s_dirt set. I did this by changing
generic_shutdown_super to use list_del_init(&sb->s_list), which allows
us to check for supers that have been removed from the super_blocks list
while we slept. The idea came from an sgi patch Hugh Dickins sent me.
Anyway, this is against 2.5.43, please review:
--- 1.161/fs/buffer.c Tue Oct 8 14:40:47 2002
+++ edited/fs/buffer.c Wed Oct 16 11:39:04 2002
@@ -217,8 +217,10 @@
sync_inodes_sb(sb, 0);
DQUOT_SYNC(sb);
lock_super(sb);
- if (sb->s_dirt && sb->s_op && sb->s_op->write_super)
+ if (sb->s_dirt && sb->s_op->write_super)
sb->s_op->write_super(sb);
+ if (sb->s_dirt && sb->s_op->commit_super)
+ sb->s_op->commit_super(sb);
unlock_super(sb);
sync_blockdev(sb->s_bdev);
sync_inodes_sb(sb, 1);
@@ -251,7 +253,7 @@
wakeup_bdflush(0);
sync_inodes(0); /* All mappings and inodes, including block devices */
DQUOT_SYNC(NULL);
- sync_supers(); /* Write the superblocks */
+ commit_supers(); /* Write the superblocks */
sync_inodes(1); /* All the mappings and inodes, again. */
return 0;
}
@@ -274,7 +276,7 @@
/* sync the superblock to buffers */
sb = inode->i_sb;
lock_super(sb);
- if (sb->s_op && sb->s_op->write_super)
+ if (sb->s_op->write_super)
sb->s_op->write_super(sb);
unlock_super(sb);
--- 1.22/fs/fs-writeback.c Sun Sep 22 17:26:49 2002
+++ edited/fs/fs-writeback.c Wed Oct 16 13:29:51 2002
@@ -57,7 +57,7 @@
* dirty the inode itself
*/
if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
- if (sb->s_op && sb->s_op->dirty_inode)
+ if (sb->s_op->dirty_inode)
sb->s_op->dirty_inode(inode);
}
@@ -103,8 +103,7 @@
static void write_inode(struct inode *inode, int sync)
{
- if (inode->i_sb->s_op && inode->i_sb->s_op->write_inode &&
- !is_bad_inode(inode))
+ if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
inode->i_sb->s_op->write_inode(inode, sync);
}
--- 1.83/fs/super.c Mon Sep 9 17:00:57 2002
+++ edited/fs/super.c Wed Oct 16 10:36:09 2002
@@ -48,6 +48,8 @@
*/
static struct super_block *alloc_super(void)
{
+ static struct super_operations default_op = {};
+
struct super_block *s = kmalloc(sizeof(struct super_block), GFP_USER);
if (s) {
memset(s, 0, sizeof(struct super_block));
@@ -72,6 +74,7 @@
s->s_maxbytes = MAX_NON_LFS;
s->dq_op = sb_dquot_ops;
s->s_qcop = sb_quotactl_ops;
+ s->s_op = &default_op;
}
out:
return s;
@@ -203,7 +206,12 @@
unlock_super(sb);
}
spin_lock(&sb_lock);
- list_del(&sb->s_list);
+
+ /*
+ * use list_del_init so we can tell later if a super with an
+ * incremented count has been removed from all lists
+ */
+ list_del_init(&sb->s_list);
list_del(&sb->s_instances);
spin_unlock(&sb_lock);
up_write(&sb->s_umount);
@@ -267,35 +275,65 @@
{
lock_super(sb);
if (sb->s_root && sb->s_dirt)
- if (sb->s_op && sb->s_op->write_super)
+ if (sb->s_op->write_super)
sb->s_op->write_super(sb);
unlock_super(sb);
}
+static inline void commit_super(struct super_block *sb)
+{
+ lock_super(sb);
+ if (sb->s_root && sb->s_dirt) {
+ if (sb->s_op->write_super)
+ sb->s_op->write_super(sb);
+ if (sb->s_op->commit_super)
+ sb->s_op->commit_super(sb);
+ }
+ unlock_super(sb);
+}
+
/*
* Note: check the dirty flag before waiting, so we don't
* hold up the sync while mounting a device. (The newly
* mounted device won't need syncing.)
*/
-void sync_supers(void)
+void dirty_super_op(void (*func)(struct super_block *))
{
struct super_block * sb;
-restart:
spin_lock(&sb_lock);
+restart:
sb = sb_entry(super_blocks.next);
- while (sb != sb_entry(&super_blocks))
+ while (sb != sb_entry(&super_blocks)) {
if (sb->s_dirt) {
sb->s_count++;
spin_unlock(&sb_lock);
down_read(&sb->s_umount);
- write_super(sb);
- drop_super(sb);
- goto restart;
- } else
- sb = sb_entry(sb->s_list.next);
+ func(sb);
+ up_read(&sb->s_umount);
+
+ spin_lock(&sb_lock);
+ if (!--sb->s_count) {
+ destroy_super(sb);
+ goto restart;
+ } else if (list_empty(&sb->s_list)) {
+ goto restart;
+ }
+ }
+ sb = sb_entry(sb->s_list.next);
+ }
spin_unlock(&sb_lock);
}
+void sync_supers(void)
+{
+ dirty_super_op(write_super);
+}
+
+void commit_supers(void)
+{
+ dirty_super_op(commit_super);
+}
+
/**
* get_super - get the superblock of a device
* @dev: device to get the superblock for
@@ -396,7 +434,7 @@
if ((flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY))
if (!fs_may_remount_ro(sb))
return -EBUSY;
- if (sb->s_op && sb->s_op->remount_fs) {
+ if (sb->s_op->remount_fs) {
lock_super(sb);
retval = sb->s_op->remount_fs(sb, &flags, data);
unlock_super(sb);
--- 1.170/include/linux/fs.h Fri Oct 11 04:49:46 2002
+++ edited/include/linux/fs.h Wed Oct 16 10:42:16 2002
@@ -818,6 +818,7 @@
void (*umount_begin) (struct super_block *);
int (*show_options)(struct seq_file *, struct vfsmount *);
+ void (*commit_super) (struct super_block *);
};
/* Inode state bits. Protected by inode_lock. */
@@ -1141,6 +1142,7 @@
extern int filemap_fdatawrite(struct address_space *);
extern int filemap_fdatawait(struct address_space *);
extern void sync_supers(void);
+extern void commit_supers(void);
extern sector_t bmap(struct inode *, sector_t);
extern int notify_change(struct dentry *, struct iattr *);
extern int permission(struct inode *, int);
^ permalink raw reply [flat|nested] 21+ messages in thread* Re: [PATCH] 2.5.x write_super is not for syncing 2002-10-16 18:00 [PATCH] 2.5.x write_super is not for syncing Chris Mason @ 2002-12-02 22:07 ` Andrew Morton 2002-12-02 23:40 ` Chris Mason 0 siblings, 1 reply; 21+ messages in thread From: Andrew Morton @ 2002-12-02 22:07 UTC (permalink / raw) To: Chris Mason; +Cc: linux-fsdevel@vger.kernel.org Six weeks ago, Chris Mason wrote: > > Hello all, > > This patch adds a commit_super super operation that allows the journaled > filesystems to do something different for the periodic write_super calls > and sync. This is the "right fix" for the ext3 data=journal problem. > Based on comments from Al about my last patch, alloc_super sets a > default empty super_operations struct on each super. This allows us to > get rid of all checks for sb->s_ops == NULL. > > sync_supers is changed so that it doesn't loop on a single FS if the > write_super call leaves sb->s_dirt set. I did this by changing > generic_shutdown_super to use list_del_init(&sb->s_list), which allows > us to check for supers that have been removed from the super_blocks list > while we slept. The idea came from an sgi patch Hugh Dickins sent me. > > Anyway, this is against 2.5.43, please review: > > --- 1.161/fs/buffer.c Tue Oct 8 14:40:47 2002 > +++ edited/fs/buffer.c Wed Oct 16 11:39:04 2002 > @@ -217,8 +217,10 @@ > sync_inodes_sb(sb, 0); > DQUOT_SYNC(sb); > lock_super(sb); > - if (sb->s_dirt && sb->s_op && sb->s_op->write_super) > + if (sb->s_dirt && sb->s_op->write_super) > sb->s_op->write_super(sb); > + if (sb->s_dirt && sb->s_op->commit_super) > + sb->s_op->commit_super(sb); Except for the s_dirt test in here. If s_dirt is zero and we have dirty inodes, the filesystem _still_ is not told what to do. I don't think we'll ever get this right until we start telling the filesystem what's happening. So instead of all these little presumptuous micro-syncs which we're doing in there, we need to turn this inside out and just call sb->s_op->sync_everything_for_umount() and let the fs decide how to get everything tight on disk. That's a bit drastic. At a minimum we need to remove that s_dirt test and make the commit_super() call unconditional. What would that break? ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] 2.5.x write_super is not for syncing 2002-12-02 22:07 ` Andrew Morton @ 2002-12-02 23:40 ` Chris Mason 2002-12-03 0:03 ` Andrew Morton 0 siblings, 1 reply; 21+ messages in thread From: Chris Mason @ 2002-12-02 23:40 UTC (permalink / raw) To: Andrew Morton; +Cc: linux-fsdevel@vger.kernel.org On Mon, 2002-12-02 at 17:07, Andrew Morton wrote: > Except for the s_dirt test in here. If s_dirt is zero and we > have dirty inodes, the filesystem _still_ is not told what to > do. Perhaps dirty inodes imply s_dirt should be true? > > I don't think we'll ever get this right until we start telling the > filesystem what's happening. So instead of all these little > presumptuous micro-syncs which we're doing in there, we need to turn > this inside out and just call sb->s_op->sync_everything_for_umount() > and let the fs decide how to get everything tight on disk. > > That's a bit drastic. At a minimum we need to remove that s_dirt > test and make the commit_super() call unconditional. > > What would that break? Not much, since foofs_commit_super could always check for s_dirt if the local FS really cared. The downside is that since we don't check for s_dirt in commit_supers(), the FS might get sunk twice if the we need to restart at the head of the supers list due to unmount. But, a double sync is possible anyway under FS load in the same situation. -chris ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] 2.5.x write_super is not for syncing 2002-12-02 23:40 ` Chris Mason @ 2002-12-03 0:03 ` Andrew Morton 2002-12-03 1:10 ` Chris Mason 0 siblings, 1 reply; 21+ messages in thread From: Andrew Morton @ 2002-12-03 0:03 UTC (permalink / raw) To: Chris Mason; +Cc: linux-fsdevel@vger.kernel.org Chris Mason wrote: > > On Mon, 2002-12-02 at 17:07, Andrew Morton wrote: > > > Except for the s_dirt test in here. If s_dirt is zero and we > > have dirty inodes, the filesystem _still_ is not told what to > > do. > > Perhaps dirty inodes imply s_dirt should be true? Well afaik (why doesn't anybody ever comment stuff? Do they prefer buggy software?) s_dirt purely means "the superblock needs to be written out". > > > > I don't think we'll ever get this right until we start telling the > > filesystem what's happening. So instead of all these little > > presumptuous micro-syncs which we're doing in there, we need to turn > > this inside out and just call sb->s_op->sync_everything_for_umount() > > and let the fs decide how to get everything tight on disk. > > > > That's a bit drastic. At a minimum we need to remove that s_dirt > > test and make the commit_super() call unconditional. > > > > What would that break? > > Not much, since foofs_commit_super could always check for s_dirt if the > local FS really cared. > > The downside is that since we don't check for s_dirt in commit_supers(), > the FS might get sunk twice if the we need to restart at the head of the > supers list due to unmount. But, a double sync is possible anyway under > FS load in the same situation. > But you do check: +static inline void commit_super(struct super_block *sb) +{ + lock_super(sb); + if (sb->s_root && sb->s_dirt) { + if (sb->s_op->write_super) + sb->s_op->write_super(sb); + if (sb->s_op->commit_super) + sb->s_op->commit_super(sb); + } + unlock_super(sb); +} + I'm thinking that for this particular problem, the solution isn't quite appropriate. We don't really give a toss about the dirty state of the superblock. We want to sync the darn filesystem. Sure, we can do that in ->commit_super(), but it really has nothing to do with the superblock. What about something along these lines? --- 25/fs/buffer.c~sync-fs Mon Dec 2 16:00:18 2002 +++ 25-akpm/fs/buffer.c Mon Dec 2 16:02:37 2002 @@ -213,7 +213,7 @@ EXPORT_SYMBOL(sync_blockdev); * superblock. Filesystem data as well as the underlying block * device. Takes the superblock lock. */ -int fsync_super(struct super_block *sb) +static int __sync_filesystem(struct super_block *sb) { sync_inodes_sb(sb, 0); DQUOT_SYNC(sb); @@ -225,6 +225,13 @@ int fsync_super(struct super_block *sb) sync_inodes_sb(sb, 1); return sync_blockdev(sb->s_bdev); +} + +int sync_filesystem(struct super_block *sb) +{ + if (sb->s_op && sb->s_op->sync_filesystem) + return sb->s_op->sync_filesystem(sb); + return __sync_filesytem(sb); } /* _ ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] 2.5.x write_super is not for syncing 2002-12-03 0:03 ` Andrew Morton @ 2002-12-03 1:10 ` Chris Mason 2002-12-03 1:40 ` Andrew Morton 0 siblings, 1 reply; 21+ messages in thread From: Chris Mason @ 2002-12-03 1:10 UTC (permalink / raw) To: Andrew Morton; +Cc: linux-fsdevel@vger.kernel.org On Mon, 2002-12-02 at 19:03, Andrew Morton wrote: > Chris Mason wrote: > > > > On Mon, 2002-12-02 at 17:07, Andrew Morton wrote: > > > > > Except for the s_dirt test in here. If s_dirt is zero and we > > > have dirty inodes, the filesystem _still_ is not told what to > > > do. > > > > Perhaps dirty inodes imply s_dirt should be true? > > Well afaik (why doesn't anybody ever comment stuff? Do they prefer > buggy software?) s_dirt purely means "the superblock needs to be > written out". Sorry, neither of my comments were horribly clear. Regardless of s_dirt's current definition, s_dirt could mean 'something on the FS needs to be written, and only the FS knows how to do it', or some other wording that covers both the 'call me for periodic writes' case and the 'call me on sync()' case. I'm not saying we really need to muddy the s_dirt definition like that, it's just an option in case someone really wants s_dirt checks everywhere. > > > That's a bit drastic. At a minimum we need to remove that s_dirt > > > test and make the commit_super() call unconditional. > > > > > > What would that break? > > > > Not much, since foofs_commit_super could always check for s_dirt if the > > local FS really cared. > > > > The downside is that since we don't check for s_dirt in commit_supers(), > > the FS might get sunk twice if the we need to restart at the head of the > > supers list due to unmount. But, a double sync is possible anyway under > > FS load in the same situation. > > > > But you do check: Right, I meant that if we change the patch to remove the s_dirt check from the commit_supers() function, a single FS might get sunk twice. That risk already exists when the FS is under load though, so I'm not horribly worried about it. > I'm thinking that for this particular problem, the solution isn't > quite appropriate. We don't really give a toss about the dirty > state of the superblock. We want to sync the darn filesystem. > Sure, we can do that in ->commit_super(), but it really has nothing > to do with the superblock. > > What about something along these lines? You'd still want sync_filesystem to get calls in the sys_sync case though, right? I'd be happy to rework the commit_super patch into a sync_filesystem patch that doesn't bother with s_dirt except when calling write_super(). The patch has two parts that matter in my mind: 1) In sync_supers (and other similar funcs), allow the FS to leave s_dirt set. This way we can setup per FS timings on when periodic writes get done. 2) Give the FS a way to tell the difference between periodic writes that don't need to force and wait for a commit, and a request from the user that does require a full commit. -chris ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] 2.5.x write_super is not for syncing 2002-12-03 1:10 ` Chris Mason @ 2002-12-03 1:40 ` Andrew Morton 2002-12-03 3:09 ` Andrew Morton 0 siblings, 1 reply; 21+ messages in thread From: Andrew Morton @ 2002-12-03 1:40 UTC (permalink / raw) To: Chris Mason; +Cc: linux-fsdevel@vger.kernel.org Chris Mason wrote: > > ... > > What about something along these lines? > > You'd still want sync_filesystem to get calls in the sys_sync case > though, right? yes. > I'd be happy to rework the commit_super patch into a > sync_filesystem patch that doesn't bother with s_dirt except when > calling write_super(). > > The patch has two parts that matter in my mind: > > 1) In sync_supers (and other similar funcs), allow the FS to leave > s_dirt set. This way we can setup per FS timings on when periodic > writes get done. hm. Is that the simplest possible way of doing such a thing? > 2) Give the FS a way to tell the difference between periodic writes that > don't need to force and wait for a commit, and a request from the user > that does require a full commit. yup. The whole sync thing is a mess, really. It's designed for ext2, and other filesystems just graft stuff on top of that, doing magical things by inference. It needs a complete rethink, IMO. And I'm not really inclined to do it in the 2.5 timeframe, mainly because the stuff we have now mostly works, and everybody's busy on other stuff. And we need a 2.4 solution, asap. These two patches are what I'll be testing tonight for 2.5. Documentation/filesystems/Locking | 7 +++++++ fs/buffer.c | 21 +++++++++++++++++++-- fs/dquot.c | 1 + fs/super.c | 5 ++--- include/linux/buffer_head.h | 2 -- include/linux/fs.h | 3 +++ include/linux/writeback.h | 1 - kernel/ksyms.c | 1 + 8 files changed, 33 insertions(+), 8 deletions(-) --- 25/fs/buffer.c~sync-fs Mon Dec 2 16:24:34 2002 +++ 25-akpm/fs/buffer.c Mon Dec 2 16:51:04 2002 @@ -213,7 +213,7 @@ EXPORT_SYMBOL(sync_blockdev); * superblock. Filesystem data as well as the underlying block * device. Takes the superblock lock. */ -int fsync_super(struct super_block *sb) +static int __sync_filesystem(struct super_block *sb) { sync_inodes_sb(sb, 0); DQUOT_SYNC(sb); @@ -227,6 +227,23 @@ int fsync_super(struct super_block *sb) return sync_blockdev(sb->s_bdev); } +int sync_filesystem(struct super_block *sb) +{ + int ret; + int cflags = current->flags; + + current->flags |= PF_SYNC; /* ick. This is a "data integrity" + * operation + */ + + if (sb->s_op && sb->s_op->sync_filesystem) + ret = sb->s_op->sync_filesystem(sb); + else + ret = __sync_filesystem(sb); + current->flags = cflags; + return ret; +} + /* * Write out and wait upon all dirty data associated with this * device. Filesystem data as well as the underlying block @@ -236,7 +253,7 @@ int fsync_bdev(struct block_device *bdev { struct super_block *sb = get_super(bdev); if (sb) { - int res = fsync_super(sb); + int res = sync_filesystem(sb); drop_super(sb); return res; } --- 25/include/linux/buffer_head.h~sync-fs Mon Dec 2 16:24:34 2002 +++ 25-akpm/include/linux/buffer_head.h Mon Dec 2 16:24:34 2002 @@ -155,8 +155,6 @@ void __wait_on_buffer(struct buffer_head void sleep_on_buffer(struct buffer_head *bh); void wake_up_buffer(struct buffer_head *bh); int fsync_bdev(struct block_device *); -int fsync_super(struct super_block *); -int fsync_no_super(struct block_device *); struct buffer_head *__find_get_block(struct block_device *, sector_t, int); struct buffer_head * __getblk(struct block_device *, sector_t, int); void __brelse(struct buffer_head *); --- 25/include/linux/fs.h~sync-fs Mon Dec 2 16:24:34 2002 +++ 25-akpm/include/linux/fs.h Mon Dec 2 16:44:00 2002 @@ -816,6 +816,7 @@ struct super_operations { int (*remount_fs) (struct super_block *, int *, char *); void (*clear_inode) (struct inode *); void (*umount_begin) (struct super_block *); + int (*sync_filesystem)(struct super_block *); int (*show_options)(struct seq_file *, struct vfsmount *); }; @@ -1311,6 +1312,8 @@ extern struct dentry *simple_lookup(stru extern ssize_t generic_read_dir(struct file *, char *, size_t, loff_t *); extern struct file_operations simple_dir_operations; extern struct inode_operations simple_dir_inode_operations; +extern int sync_filesystem(struct super_block *sb); +extern void sync_inodes_sb(struct super_block *, int wait); #ifdef CONFIG_BLK_DEV_INITRD extern unsigned int real_root_dev; --- 25/fs/super.c~sync-fs Mon Dec 2 16:24:34 2002 +++ 25-akpm/fs/super.c Mon Dec 2 16:24:34 2002 @@ -27,7 +27,6 @@ #include <linux/blkdev.h> #include <linux/quotaops.h> #include <linux/namei.h> -#include <linux/buffer_head.h> /* for fsync_super() */ #include <linux/mount.h> #include <linux/security.h> #include <asm/uaccess.h> @@ -180,7 +179,7 @@ void generic_shutdown_super(struct super shrink_dcache_parent(root); shrink_dcache_anon(&sb->s_anon); dput(root); - fsync_super(sb); + sync_filesystem(sb); lock_super(sb); lock_kernel(); sb->s_flags &= ~MS_ACTIVE; @@ -391,7 +390,7 @@ int do_remount_sb(struct super_block *sb if (flags & MS_RDONLY) acct_auto_close(sb); shrink_dcache_sb(sb); - fsync_super(sb); + sync_filesystem(sb); /* If we are remounting RDONLY, make sure there are no rw files open */ if ((flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY)) if (!fs_may_remount_ro(sb)) --- 25/Documentation/filesystems/Locking~sync-fs Mon Dec 2 16:27:20 2002 +++ 25-akpm/Documentation/filesystems/Locking Mon Dec 2 16:28:56 2002 @@ -96,6 +96,7 @@ prototypes: int (*remount_fs) (struct super_block *, int *, char *); void (*clear_inode) (struct inode *); void (*umount_begin) (struct super_block *); + int (*sync_filesystem)(struct super_block *); locking rules: All may block. @@ -111,11 +112,17 @@ write_super: no yes maybe (see below) statfs: no no no remount_fs: yes yes maybe (see below) umount_begin: yes no maybe (see below) +sync_filesystem:no no ?? ->read_inode() is not a method - it's a callback used in iget(). rules for mount_sem are not too nice - it is going to die and be replaced by better scheme anyway. +sync_filesystem() must commit all pending data (and metadata) to disk, +and wait on its writeout. It is a full sync of everything in that fs, +period. + + --------------------------- file_system_type --------------------------- prototypes: struct super_block *(*get_sb) (struct file_system_type *, int, char *, void *); --- 25/include/linux/writeback.h~sync-fs Mon Dec 2 16:43:32 2002 +++ 25-akpm/include/linux/writeback.h Mon Dec 2 16:43:51 2002 @@ -50,7 +50,6 @@ struct writeback_control { void writeback_inodes(struct writeback_control *wbc); void wake_up_inode(struct inode *inode); void __wait_on_inode(struct inode * inode); -void sync_inodes_sb(struct super_block *, int wait); void sync_inodes(int wait); /* writeback.h requires fs.h; it, too, is not included from here. */ --- 25/kernel/ksyms.c~sync-fs Mon Dec 2 16:56:24 2002 +++ 25-akpm/kernel/ksyms.c Mon Dec 2 16:57:37 2002 @@ -546,6 +546,7 @@ EXPORT_SYMBOL(buffer_insert_list); EXPORT_SYMBOL(make_bad_inode); EXPORT_SYMBOL(is_bad_inode); EXPORT_SYMBOL(__inode_dir_notify); +EXPORT_SYMBOL(sync_inodes_sb); #ifdef CONFIG_UID16 EXPORT_SYMBOL(overflowuid); --- 25/fs/dquot.c~sync-fs Mon Dec 2 16:58:07 2002 +++ 25-akpm/fs/dquot.c Mon Dec 2 16:58:24 2002 @@ -1425,3 +1425,4 @@ EXPORT_SYMBOL(unregister_quota_format); EXPORT_SYMBOL(dqstats); EXPORT_SYMBOL(dq_list_lock); EXPORT_SYMBOL(dq_data_lock); +EXPORT_SYMBOL(sync_dquots); _ fs/ext3/super.c | 85 ++++++++++++++++++++++++++++++++++++++++---------------- 1 files changed, 61 insertions(+), 24 deletions(-) --- 25/fs/ext3/super.c~ext3-sync-fs Mon Dec 2 17:00:20 2002 +++ 25-akpm/fs/ext3/super.c Mon Dec 2 17:15:53 2002 @@ -29,6 +29,7 @@ #include <linux/blkdev.h> #include <linux/smp_lock.h> #include <linux/buffer_head.h> +#include <linux/quotaops.h> #include <asm/uaccess.h> #include "xattr.h" #include "acl.h" @@ -47,6 +48,7 @@ static void ext3_mark_recovery_complete( struct ext3_super_block * es); static void ext3_clear_journal_err(struct super_block * sb, struct ext3_super_block * es); +static int ext3_sync_filesystem(struct super_block *sb); #ifdef CONFIG_JBD_DEBUG int journal_no_write[2]; @@ -495,18 +497,19 @@ static void ext3_clear_inode(struct inod static struct super_operations ext3_sops = { .alloc_inode = ext3_alloc_inode, .destroy_inode = ext3_destroy_inode, - .read_inode = ext3_read_inode, /* BKL held */ - .write_inode = ext3_write_inode, /* BKL not held. Don't need */ - .dirty_inode = ext3_dirty_inode, /* BKL not held. We take it */ - .put_inode = ext3_put_inode, /* BKL not held. Don't need */ - .delete_inode = ext3_delete_inode, /* BKL not held. We take it */ - .put_super = ext3_put_super, /* BKL held */ - .write_super = ext3_write_super, /* BKL not held. We take it. Needed? */ - .write_super_lockfs = ext3_write_super_lockfs, /* BKL not held. Take it */ - .unlockfs = ext3_unlockfs, /* BKL not held. We take it */ - .statfs = ext3_statfs, /* BKL not held. */ - .remount_fs = ext3_remount, /* BKL held */ - .clear_inode = ext3_clear_inode, /* BKL not needed. */ + .read_inode = ext3_read_inode, + .write_inode = ext3_write_inode, + .dirty_inode = ext3_dirty_inode, + .put_inode = ext3_put_inode, + .delete_inode = ext3_delete_inode, + .put_super = ext3_put_super, + .write_super = ext3_write_super, + .sync_filesystem= ext3_sync_filesystem, + .write_super_lockfs = ext3_write_super_lockfs, + .unlockfs = ext3_unlockfs, + .statfs = ext3_statfs, + .remount_fs = ext3_remount, + .clear_inode = ext3_clear_inode, }; struct dentry *ext3_get_parent(struct dentry *child); @@ -1714,25 +1717,59 @@ int ext3_force_commit(struct super_block * This implicitly triggers the writebehind on sync(). */ -static int do_sync_supers = 0; -MODULE_PARM(do_sync_supers, "i"); -MODULE_PARM_DESC(do_sync_supers, "Write superblocks synchronously"); - void ext3_write_super (struct super_block * sb) { - tid_t target; lock_kernel(); if (down_trylock(&sb->s_lock) == 0) - BUG(); /* aviro detector */ + BUG(); sb->s_dirt = 0; - target = log_start_commit(EXT3_SB(sb)->s_journal, NULL); + log_start_commit(EXT3_SB(sb)->s_journal, NULL); + unlock_kernel(); +} - if (do_sync_supers) { - unlock_super(sb); - log_wait_commit(EXT3_SB(sb)->s_journal, target); - lock_super(sb); - } +static int ext3_sync_filesystem(struct super_block *sb) +{ + tid_t target; + + /* + * The pagecache flush may instantiate new indirects and + * may dirty inodes. + */ + sync_inodes_sb(sb, 0); + + /* + * Quota sync may write to files + */ + DQUOT_SYNC(sb); + + /* + * So sync the files again + */ + sync_inodes_sb(sb, 0); + + /* + * Now run a commit, and wait on it. + */ + lock_kernel(); + sb->s_dirt = 0; + target = log_start_commit(EXT3_SB(sb)->s_journal, NULL); + log_wait_commit(EXT3_SB(sb)->s_journal, target); unlock_kernel(); + + /* + * For data=journal inodes, the commit may have generated new dirty + * buffers. Their metadata is all tight on disk, so just flush the + * pages again. And wait on them. + */ + sync_inodes_sb(sb, 0); /* Nicer IO scheduling */ + sync_inodes_sb(sb, 1); /* Wait here */ + + /* + * And we know that the only dirty buffers will be against the blockdev + * mapping, so write them and wait on them. + */ + sync_blockdev(sb->s_bdev); + return 0; } /* _ ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] 2.5.x write_super is not for syncing 2002-12-03 1:40 ` Andrew Morton @ 2002-12-03 3:09 ` Andrew Morton 2002-12-03 19:36 ` Bryan Henderson 0 siblings, 1 reply; 21+ messages in thread From: Andrew Morton @ 2002-12-03 3:09 UTC (permalink / raw) To: Chris Mason, linux-fsdevel@vger.kernel.org, Stephen C. Tweedie Andrew Morton wrote: > > These two patches are what I'll be testing tonight for 2.5. > I don't feel inclined to take on the whole sync thing at present. So a minimal fix for this problem is as follows. Against 2.4.20: fs/buffer.c | 2 ++ fs/super.c | 4 ++++ include/linux/fs.h | 1 + 3 files changed, 7 insertions(+) --- linux-akpm/fs/buffer.c~commit-fs Mon Dec 2 18:40:03 2002 +++ linux-akpm-akpm/fs/buffer.c Mon Dec 2 18:41:34 2002 @@ -327,6 +327,8 @@ int fsync_super(struct super_block *sb) lock_super(sb); if (sb->s_dirt && sb->s_op && sb->s_op->write_super) sb->s_op->write_super(sb); + if (sb->s_op && sb->s_op->sync_fs) + sb->s_op->sync_fs(sb); unlock_super(sb); unlock_kernel(); --- linux-akpm/include/linux/fs.h~commit-fs Mon Dec 2 18:40:23 2002 +++ linux-akpm-akpm/include/linux/fs.h Mon Dec 2 18:40:56 2002 @@ -894,6 +894,7 @@ struct super_operations { void (*delete_inode) (struct inode *); void (*put_super) (struct super_block *); void (*write_super) (struct super_block *); + int (*sync_fs) (struct super_block *); void (*write_super_lockfs) (struct super_block *); void (*unlockfs) (struct super_block *); int (*statfs) (struct super_block *, struct statfs *); --- linux-akpm/fs/super.c~commit-fs Mon Dec 2 18:44:01 2002 +++ linux-akpm-akpm/fs/super.c Mon Dec 2 18:44:07 2002 @@ -454,6 +454,8 @@ void sync_supers(kdev_t dev) if (sb) { if (sb->s_dirt) write_super(sb); + if (sb->s_op && sb->s_op->sync_fs) + sb->s_op->sync_fs(sb); drop_super(sb); } return; @@ -467,6 +469,8 @@ restart: spin_unlock(&sb_lock); down_read(&sb->s_umount); write_super(sb); + if (sb->s_op && sb->s_op->sync_fs) + sb->s_op->sync_fs(sb); drop_super(sb); goto restart; } else _ and fs/ext3/super.c | 25 +++++++++++++------------ 1 files changed, 13 insertions(+), 12 deletions(-) --- linux-akpm/fs/ext3/super.c~ext3_sync_fs Mon Dec 2 18:45:10 2002 +++ linux-akpm-akpm/fs/ext3/super.c Mon Dec 2 18:46:59 2002 @@ -47,6 +47,8 @@ static void ext3_mark_recovery_complete( static void ext3_clear_journal_err(struct super_block * sb, struct ext3_super_block * es); +static int ext3_sync_fs(struct super_block * sb); + #ifdef CONFIG_JBD_DEBUG int journal_no_write[2]; @@ -454,6 +456,7 @@ static struct super_operations ext3_sops delete_inode: ext3_delete_inode, /* BKL not held. We take it */ put_super: ext3_put_super, /* BKL held */ write_super: ext3_write_super, /* BKL held */ + sync_fs: ext3_sync_fs, write_super_lockfs: ext3_write_super_lockfs, /* BKL not held. Take it */ unlockfs: ext3_unlockfs, /* BKL not held. We take it */ statfs: ext3_statfs, /* BKL held */ @@ -1577,24 +1580,22 @@ int ext3_force_commit(struct super_block * This implicitly triggers the writebehind on sync(). */ -static int do_sync_supers = 0; -MODULE_PARM(do_sync_supers, "i"); -MODULE_PARM_DESC(do_sync_supers, "Write superblocks synchronously"); - void ext3_write_super (struct super_block * sb) { + if (down_trylock(&sb->s_lock) == 0) + BUG(); + sb->s_dirt = 0; + log_start_commit(EXT3_SB(sb)->s_journal, NULL); +} + +static int ext3_sync_fs(struct super_block *sb) +{ tid_t target; - if (down_trylock(&sb->s_lock) == 0) - BUG(); /* aviro detector */ sb->s_dirt = 0; target = log_start_commit(EXT3_SB(sb)->s_journal, NULL); - - if (do_sync_supers) { - unlock_super(sb); - log_wait_commit(EXT3_SB(sb)->s_journal, target); - lock_super(sb); - } + log_wait_commit(EXT3_SB(sb)->s_journal, target); + return 0; } /* _ ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] 2.5.x write_super is not for syncing 2002-12-03 3:09 ` Andrew Morton @ 2002-12-03 19:36 ` Bryan Henderson 2002-12-03 20:06 ` Andrew Morton 0 siblings, 1 reply; 21+ messages in thread From: Bryan Henderson @ 2002-12-03 19:36 UTC (permalink / raw) To: Andrew Morton Cc: linux-fsdevel@vger.kernel.org, Chris Mason, Stephen C. Tweedie >--- linux-akpm/fs/buffer.c~commit-fs Mon Dec 2 18:40:03 2002 >+++ linux-akpm-akpm/fs/buffer.c Mon Dec 2 18:41:34 2002 >@@ -327,6 +327,8 @@ int fsync_super(struct super_block *sb) > lock_super(sb); > if (sb->s_dirt && sb->s_op && sb->s_op->write_super) > sb->s_op->write_super(sb); >+ if (sb->s_op && sb->s_op->sync_fs) >+ sb->s_op->sync_fs(sb); > unlock_super(sb); > unlock_kernel(); > etc. That's very close to solving a major problem I've been struggling with for a while in this area. There seems to be a big flaw, though. I have a filesystem type (STFS) that has no super block. According to the spirit of the Linux design, then, the s_dirt flag has no meaning and the filesystem driver has no business having a ->write_super routine. But of course a ->write_super call is the only evidence I have that the user did a sync(), which is something I need to know. Unfortunately, a ->write_super call can also simply mean kupdated is doing its periodic thing. So the code I had to write to implement sync(), while not looking like ext2 in a bunch of other ways, horribly abuses the interface, works differently on different versions of Linux, and in some circumstances isn't even 100% correct. A driver for the same filesystem type on AIX has no problem because AIX has an equivalent to ->sync_fs and nothing like ->write_super (AIX VFS is more general -- doesn't pretend to know about things like super blocks). So this change looks like it makes Linux as hospitable as AIX to STFS, but here's where it appears to fall short: It still uses the same VFS call to implement the sync() system call as to implement kupdated's purpose. Could there be a separate function or an argument, or do I have to continue looking at the process name to distinguish them? ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] 2.5.x write_super is not for syncing 2002-12-03 19:36 ` Bryan Henderson @ 2002-12-03 20:06 ` Andrew Morton 2002-12-03 21:41 ` Bryan Henderson 0 siblings, 1 reply; 21+ messages in thread From: Andrew Morton @ 2002-12-03 20:06 UTC (permalink / raw) To: Bryan Henderson Cc: linux-fsdevel@vger.kernel.org, Chris Mason, Stephen C. Tweedie Bryan Henderson wrote: > > ... > But of course a ->write_super call is the only evidence I have that the > user did a sync(), which is something I need to know. You and the rest of the world. > Unfortunately, a > ->write_super call can also simply mean kupdated is doing its periodic > thing. So the code I had to write to implement sync(), while not looking > like ext2 in a bunch of other ways, horribly abuses the interface, works > differently on different versions of Linux, and in some circumstances isn't > even 100% correct. > > A driver for the same filesystem type on AIX has no problem because AIX has > an equivalent to ->sync_fs and nothing like ->write_super (AIX VFS is more > general -- doesn't pretend to know about things like super blocks). > > So this change looks like it makes Linux as hospitable as AIX to STFS, but > here's where it appears to fall short: It still uses the same VFS call to > implement the sync() system call as to implement kupdated's purpose. > Could there be a separate function or an argument, or do I have to continue > looking at the process name to distinguish them? Oh darn. Yes, both kupdate and sync call sync_supers(). But kupdate should not call ->sync_fs. I'll fix that up. The intent is that ->sync_fs() should only be called for sync, unmount, etc. ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] 2.5.x write_super is not for syncing 2002-12-03 20:06 ` Andrew Morton @ 2002-12-03 21:41 ` Bryan Henderson 2002-12-03 22:13 ` Andrew Morton 0 siblings, 1 reply; 21+ messages in thread From: Bryan Henderson @ 2002-12-03 21:41 UTC (permalink / raw) To: Andrew Morton Cc: akpm, linux-fsdevel@vger.kernel.org, Chris Mason, Stephen C. Tweedie >The intent is that ->sync_fs() should only be >called for sync, unmount, etc. Why unmount, etc.? That's just more assumption-making by FS as to what these operations require for every filesystem type. unmount, which in general requires a rather different kind of syncing than sync(), already has its own VFS call. I don't know what etc. would be but it's probably also something a filesystem driver might want to distinguish from sync(). ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] 2.5.x write_super is not for syncing 2002-12-03 21:41 ` Bryan Henderson @ 2002-12-03 22:13 ` Andrew Morton 2002-12-04 2:05 ` Bryan Henderson 0 siblings, 1 reply; 21+ messages in thread From: Andrew Morton @ 2002-12-03 22:13 UTC (permalink / raw) To: Bryan Henderson Cc: linux-fsdevel@vger.kernel.org, Chris Mason, Stephen C. Tweedie Bryan Henderson wrote: > > >The intent is that ->sync_fs() should only be > >called for sync, unmount, etc. > > Why unmount, etc.? That's just more assumption-making by FS as to what > these operations require for every filesystem type. > > unmount, which in general requires a rather different kind of syncing than > sync(), already has its own VFS call. > I don't know what etc. would be but it's probably also something a > filesystem driver might want to distinguish from sync(). Well, I'd say that telling the fs about umount activity is "out of scope" for this exercise ;) It seems reasonable to have a super_op which says "sync everything to disk". And to call that in response to /bin/sync and unmount. Doesn't it? ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] 2.5.x write_super is not for syncing 2002-12-03 22:13 ` Andrew Morton @ 2002-12-04 2:05 ` Bryan Henderson 2002-12-04 4:29 ` Andrew Morton 2002-12-04 21:10 ` Stephen C. Tweedie 0 siblings, 2 replies; 21+ messages in thread From: Bryan Henderson @ 2002-12-04 2:05 UTC (permalink / raw) To: Andrew Morton Cc: akpm, linux-fsdevel@vger.kernel.org, Chris Mason, Stephen C. Tweedie >Well, I'd say that telling the fs about umount activity is "out >of scope" for this exercise ;) Not sure what your point is here. The filesystem driver already knows when the filesystem image is being destroyed (i.e. the struct super_block is going away), which is the only umount activity I'm talking about. If destroying a filesystem image of a particular type means cached stuff has to get sync'ed, the filesystem driver knows to do it then; there's no need for a ->sync_fs call in addition to ->put_super. >It seems reasonable to have a super_op which says "sync everything >to disk". Well, maybe, but the question is when should FS call that? I say performing the sync() system call is the only time it should. If it calls it for any other reason, it is making assumptions about how the particular filesystem type works. Because "sync everything to disk" is ambiguous in the modern diverse world of filesystems (so is fsync() -- note that standards define these pretty vaguely in recognition of that), the only sensible way to define it is to say "it's what sync() does -- ask your filesystem type designer what sync() does on your filesystem." If there's any other writing of cached data to disk that needs to be done, I say the filesystem driver knows when to do it; FS should not presume to know. Of course, in ext2 and its ilk, FS understands the inner workings and is deeply involved in caching and uses the traditional ->write_super, etc. to carry it out. We're only talking about new, more general kinds of filesystems now. ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] 2.5.x write_super is not for syncing 2002-12-04 2:05 ` Bryan Henderson @ 2002-12-04 4:29 ` Andrew Morton 2002-12-04 19:00 ` Bryan Henderson 2002-12-04 21:10 ` Stephen C. Tweedie 1 sibling, 1 reply; 21+ messages in thread From: Andrew Morton @ 2002-12-04 4:29 UTC (permalink / raw) To: Bryan Henderson Cc: linux-fsdevel@vger.kernel.org, Chris Mason, Stephen C. Tweedie Bryan Henderson wrote: > > >Well, I'd say that telling the fs about umount activity is "out > >of scope" for this exercise ;) > > Not sure what your point is here. The filesystem driver already knows when > the filesystem image is being destroyed (i.e. the struct super_block is > going away), which is the only umount activity I'm talking about. If > destroying a filesystem image of a particular type means cached stuff has > to get sync'ed, the filesystem driver knows to do it then; there's no need > for a ->sync_fs call in addition to ->put_super. In the context of the 2.4 kernel it is not practical to change every filesystem to sync itself at unmount. Nor in 2.5, by my hand... > >It seems reasonable to have a super_op which says "sync everything > >to disk". > > Well, maybe, but the question is when should FS call that? I say > performing the sync() system call is the only time it should. If it calls > it for any other reason, it is making assumptions about how the particular > filesystem type works. Yes, it always has. And Linux's tradition of forcing the filesystems to comply with the VFS's and VM's requirements has always been a strength. Because we can make deep changes to the VM and VFS without having to make deep changes to filesytems. Yes, this makes porting filesystems from other OS'es harder. That is tough luck. It's better this way. Filesystems which do weird stuff are a pain - they drag down core kernel development. > Because "sync everything to disk" is ambiguous in the modern diverse world > of filesystems (so is fsync() -- note that standards define these pretty > vaguely in recognition of that), the only sensible way to define it is to > say "it's what sync() does -- ask your filesystem type designer what sync() > does on your filesystem." > > If there's any other writing of cached data to disk that needs to be done, > I say the filesystem driver knows when to do it; FS should not presume to > know. The move is more toward "if the fs needs to do special stuff then it should have a hook to do that". In concrete terms, this means rather than "write your superblock now" we need to tell the fs what's happening at a higher level, and if the fs isn't interested, perform the default (ext2-style) operation. I don't think anyone has a problem with that approach. And as new filesystems are added to Linux, each one brings in its own set of new hooks. And I think that's a fine approach - we should not be speculatively adding hooks if they have no in-kernel consumers. > Of course, in ext2 and its ilk, FS understands the inner workings and is > deeply involved in caching and uses the traditional ->write_super, etc. to > carry it out. We're only talking about new, more general kinds of > filesystems now. ->write_super() seems to be a bit of a crock. It _should_ be "the VFS superblock info has changed. You should propagate that to pagecache but not start IO against it". It's not clear what it actually means at present. But for your fs, I suspect we'll need to see specific descriptions of the problems which are being encountered, and patches. ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] 2.5.x write_super is not for syncing 2002-12-04 4:29 ` Andrew Morton @ 2002-12-04 19:00 ` Bryan Henderson 2002-12-04 19:23 ` Matthew Wilcox 0 siblings, 1 reply; 21+ messages in thread From: Bryan Henderson @ 2002-12-04 19:00 UTC (permalink / raw) To: Andrew Morton Cc: linux-fsdevel@vger.kernel.org, Chris Mason, Stephen C. Tweedie >In the context of the 2.4 kernel it is not practical to change every >filesystem to sync itself at unmount. > >Nor in 2.5, by my hand... I agree. They would continue to be synced at unmount time by FS like they are today. They would continue not to implement a ->fs_sync routine. I thought we were talking about expanding the VFS interface so that filesystem types for which the existing system is inadequate can still work. >Linux's tradition of forcing the filesystems >to comply with the VFS's and VM's requirements has always been a >strength. OK, well at least we found the heart of the disagreement. I consider that a weakness and favor flexibility. >Yes, this makes porting filesystems from other OS'es harder. That is >tough luck. It's better this way. Filesystems which do weird stuff >are a pain - they drag down core kernel development. It's not so much porting from other OSes -- that isn't really important. It's implementing new concepts where loading a new kernel module into an existing system is acceptable, but replacing the base kernel in that system is not. In the particular case of my filesystem type -- STFS -- that is possible with AIX, Solaris, and Windows but not Linux. And that's not because those systems were designed for the particular needs of STFS or vice versa -- it's just because their "VFS" interface is at a higher level. I'm not arguing; everyone has different goals for Linux in general and for kernel.org kernels in particular, and they're often quite different from the designers' goals for those other operating systems. I'm just exposing the big picture. >we should not be speculatively adding hooks if they have no in-kernel consumers. That policy is pretty hostile toward commercial use of Linux (which is what my job at IBM is concerned with). In the commercial world, it is generally required that you be able to add a new function to an existing system by doing something as simple as loading a new kernel module. The truth is that for STFS, it's already too late (for commercial practicality) to talk about fixing the sync problem via kernel.org kernels. I've just been using it as an example and hoping the problem could be avoided for the next novel filesystem type to come along. >->write_super() seems to be a bit of a crock. It _should_ be "the VFS >superblock info has changed. You should propagate that to pagecache >but not start IO against it". It's not clear what it actually means >at present. However one defines it (and in Linux definitions of interfaces are strictly matters of opinion), it _should_ be either an imperative interface or a declarative one; your proposal is both at the same time. An imperative interface call says, "Do X. It's none of your business why I think you should." A declarative interface says, "X is now true. Do whatever you think you should based on that information." So I would go either with "superblock info has changed" or "write your superblock into the pagecache," but not both. ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] 2.5.x write_super is not for syncing 2002-12-04 19:00 ` Bryan Henderson @ 2002-12-04 19:23 ` Matthew Wilcox 2002-12-04 22:17 ` Bryan Henderson 0 siblings, 1 reply; 21+ messages in thread From: Matthew Wilcox @ 2002-12-04 19:23 UTC (permalink / raw) To: Bryan Henderson Cc: Andrew Morton, linux-fsdevel@vger.kernel.org, Chris Mason, Stephen C. Tweedie On Wed, Dec 04, 2002 at 11:00:19AM -0800, Bryan Henderson wrote: > >Linux's tradition of forcing the filesystems > >to comply with the VFS's and VM's requirements has always been a > >strength. > > OK, well at least we found the heart of the disagreement. I consider that > a weakness and favor flexibility. That's certainly a matter of opinion. I'm quite firmly on the same side as Andrew, and I offer supporting material: http://www.caldo.demon.co.uk/doc/taste.pdf (Section 3.2 is probably most germane, but it's a short paper and worth reading in full). > In the particular case of my filesystem type -- STFS -- that is > possible with AIX, Solaris, and Windows but not Linux. What is STFS, btw? > > we should not be speculatively adding hooks if they have no in-kernel > > consumers. > > That policy is pretty hostile toward commercial use of Linux (which is what > my job at IBM is concerned with). That's not true. It's hostile towards non-source extensions of Linux. That's always been abundantly clear. -- "It's not Hollywood. War is real, war is primarily not about defeat or victory, it is about death. I've seen thousands and thousands of dead bodies. Do you think I want to have an academic debate on this subject?" -- Robert Fisk ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] 2.5.x write_super is not for syncing 2002-12-04 19:23 ` Matthew Wilcox @ 2002-12-04 22:17 ` Bryan Henderson 2002-12-05 10:36 ` Arnaldo Carvalho de Melo 0 siblings, 1 reply; 21+ messages in thread From: Bryan Henderson @ 2002-12-04 22:17 UTC (permalink / raw) To: Matthew Wilcox Cc: Andrew Morton, linux-fsdevel@vger.kernel.org, Chris Mason, Stephen C. Tweedie >What is STFS, btw? It's Storage Tank File System -- a shared filesystem type that IBM has been talking about making for a couple of years. It's supposed to allow you to put a single filesystem on a large number of shared disks and have all the processes on all those systems access the filesystem as if they were all on one big system. But those disks contain only data blocks. Metadata is kept behind a server which is accessed via conventional network interfaces. Clients of that metadata server aggressively cache metadata in order to provide performance equivalent to local filesystems in usual cases. The various systems can be running various operating systems, unix and Windows alike. It would be nice if some could be Linux. Linux FS assumes in many ways that the contents of a filesystem don't change except through it. The basic FS structure has been bent (with various revalidate VFS calls) to make NFS work, but NFS does not promise (or deliver) perfect file sharing. The fact that FS presumes to be able to manage a cache of filesystem contents interferes with the STFS filesystem driver's need to coordinate file contents with other systems that see the same disks and metadata servers. Another way it has been hard to get correct STFS semantics out of Linux is that the cache the filesystem driver keeps is there whether or not you have the filesystem mounted (or mounted 10 times on the same system). So unmounting does not necessarily mean you need to sync the cache, but I still believe if a user invokes sync(), he expects all updates in that cache to be made permanent. Hence my wish for FS to have a "sync everything" VFS call, but not to call it just because it is unmounting. >> > we should not be speculatively adding hooks if they have no in-kernel >> > consumers. >> >> That policy is pretty hostile toward commercial use of Linux (which is what >> my job at IBM is concerned with). > >That's not true. It's hostile towards non-source extensions of Linux. I assume "non-source extension" means an extension that isn't distributed in the Linux kernel source tree, right? My claim is that commercial use is highly dependent on non-source extensions, so hostile toward non-source extensions means hostile toward commercial use. Commerce works by consumers getting stuff from multiple independent suppliers, who supply to multiple independent consumers. I point to traditional commercial operating systems for examples: AIX's VFS interface allows for kernel extensions supplied by companies that have no working relationship with IBM and which do things no one else was doing at the time AIX shipped. >That's always been abundantly clear. I agree. Its one of the things people cite whenever I try to include Linux in a commercial endeavor. ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] 2.5.x write_super is not for syncing 2002-12-04 22:17 ` Bryan Henderson @ 2002-12-05 10:36 ` Arnaldo Carvalho de Melo 2002-12-05 16:31 ` Stephen C. Tweedie 0 siblings, 1 reply; 21+ messages in thread From: Arnaldo Carvalho de Melo @ 2002-12-05 10:36 UTC (permalink / raw) To: Bryan Henderson Cc: Matthew Wilcox, Andrew Morton, linux-fsdevel@vger.kernel.org, Chris Mason, Stephen C. Tweedie Em Wed, Dec 04, 2002 at 02:17:40PM -0800, Bryan Henderson escreveu: > > >What is STFS, btw? > > It's Storage Tank File System -- a shared filesystem type that IBM has been > talking about making for a couple of years. It's supposed to allow you to > put a single filesystem on a large number of shared disks and have all the > processes on all those systems access the filesystem as if they were all on > one big system. But those disks contain only data blocks. Metadata is > kept behind a server which is accessed via conventional network interfaces. > Clients of that metadata server aggressively cache metadata in order to > provide performance equivalent to local filesystems > in usual cases. Alert: FS newbie talking. Isn't this what Peter Braam's Lustre FS is doing? http://www.lustre.org - Arnaldo ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] 2.5.x write_super is not for syncing 2002-12-05 10:36 ` Arnaldo Carvalho de Melo @ 2002-12-05 16:31 ` Stephen C. Tweedie 2002-12-05 17:24 ` girish 0 siblings, 1 reply; 21+ messages in thread From: Stephen C. Tweedie @ 2002-12-05 16:31 UTC (permalink / raw) To: Arnaldo Carvalho de Melo Cc: Bryan Henderson, Matthew Wilcox, Andrew Morton, linux-fsdevel@vger.kernel.org, Chris Mason, Stephen Tweedie Hi, On Thu, 2002-12-05 at 10:36, Arnaldo Carvalho de Melo wrote: > Em Wed, Dec 04, 2002 at 02:17:40PM -0800, Bryan Henderson escreveu: > > It's Storage Tank File System -- a shared filesystem type that IBM has been > > talking about making for a couple of years. It's supposed to allow you to > > put a single filesystem on a large number of shared disks and have all the > > processes on all those systems access the filesystem as if they were all on > > one big system. But those disks contain only data blocks. Metadata is > > kept behind a server which is accessed via conventional network interfaces. > Isn't this what Peter Braam's Lustre FS is doing? http://www.lustre.org Yes, sounds very similar indeed. Cheers, Stephen ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] 2.5.x write_super is not for syncing 2002-12-05 16:31 ` Stephen C. Tweedie @ 2002-12-05 17:24 ` girish 0 siblings, 0 replies; 21+ messages in thread From: girish @ 2002-12-05 17:24 UTC (permalink / raw) To: Stephen C. Tweedie, Arnaldo Carvalho de Melo Cc: Bryan Henderson, Matthew Wilcox, Andrew Morton, linux-fsdevel, Chris Mason, Stephen Tweedie > Hi, > > On Thu, 2002-12-05 at 10:36, Arnaldo Carvalho de Melo wrote: > > Em Wed, Dec 04, 2002 at 02:17:40PM -0800, Bryan Henderson escreveu: > > > > It's Storage Tank File System -- a shared filesystem type that IBM has been > > > talking about making for a couple of years. It's supposed to allow you to > > > put a single filesystem on a large number of shared disks and have all the > > > processes on all those systems access the filesystem as if they were all on > > > one big system. But those disks contain only data blocks. Metadata is > > > kept behind a server which is accessed via conventional network interfaces. > > > Isn't this what Peter Braam's Lustre FS is doing? http://www.lustre.org > > Yes, sounds very similar indeed. In lustre we have network connection between client and data servers, where as STFS is similar to CXFS from SGI Regards Girish ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] 2.5.x write_super is not for syncing 2002-12-04 2:05 ` Bryan Henderson 2002-12-04 4:29 ` Andrew Morton @ 2002-12-04 21:10 ` Stephen C. Tweedie 2002-12-04 22:46 ` Bryan Henderson 1 sibling, 1 reply; 21+ messages in thread From: Stephen C. Tweedie @ 2002-12-04 21:10 UTC (permalink / raw) To: Bryan Henderson Cc: Andrew Morton, linux-fsdevel@vger.kernel.org, Chris Mason, Stephen Tweedie Hi, On Wed, 2002-12-04 at 02:05, Bryan Henderson wrote: > >Well, I'd say that telling the fs about umount activity is "out > >of scope" for this exercise ;) > > Not sure what your point is here. The filesystem driver already knows when > the filesystem image is being destroyed (i.e. the struct super_block is > going away), which is the only umount activity I'm talking about. If > destroying a filesystem image of a particular type means cached stuff has > to get sync'ed, the filesystem driver knows to do it then; there's no need > for a ->sync_fs call in addition to ->put_super. Unfortunately, for 2.4 that is not true. ->put_super() cannot do the cache flush. It can't do it, because by the time we call it, the core VFS layer has already completely flushed the cache from memory, so it's too late. That's why we need a separate call to indicate the start of the umount process; put_super only marks the end of it when it's too late to do this particular piece of processing. > >It seems reasonable to have a super_op which says "sync everything > >to disk". > > Well, maybe, but the question is when should FS call that? And more importantly, when is it finished? That's the piece of the equation that's missing in the umount case. --Stephen ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] 2.5.x write_super is not for syncing 2002-12-04 21:10 ` Stephen C. Tweedie @ 2002-12-04 22:46 ` Bryan Henderson 0 siblings, 0 replies; 21+ messages in thread From: Bryan Henderson @ 2002-12-04 22:46 UTC (permalink / raw) To: Stephen C. Tweedie Cc: Andrew Morton, linux-fsdevel@vger.kernel.org, Chris Mason, Stephen Tweedie >->put_super() cannot do the >cache flush. It can't do it, because by the time we call it, the core >VFS layer has already completely flushed the cache from memory, I think this ->fs_sync call we've been talking about is something different from what I thought it was. Is this about pushing updates which are cached above the page cache into the page cache where normal page cache cleaning can effect the actual writing? If so, that's much more specialized than what I imagined -- it's applicable only to a fairly specific filesystem driver implementation. I'd probably call that something like ->fs_prepare_for_sync instead. Not that it won't get abused just like ->write_super by folks who just want to detect when someone does sync() :-) ^ permalink raw reply [flat|nested] 21+ messages in thread
end of thread, other threads:[~2002-12-05 17:24 UTC | newest] Thread overview: 21+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2002-10-16 18:00 [PATCH] 2.5.x write_super is not for syncing Chris Mason 2002-12-02 22:07 ` Andrew Morton 2002-12-02 23:40 ` Chris Mason 2002-12-03 0:03 ` Andrew Morton 2002-12-03 1:10 ` Chris Mason 2002-12-03 1:40 ` Andrew Morton 2002-12-03 3:09 ` Andrew Morton 2002-12-03 19:36 ` Bryan Henderson 2002-12-03 20:06 ` Andrew Morton 2002-12-03 21:41 ` Bryan Henderson 2002-12-03 22:13 ` Andrew Morton 2002-12-04 2:05 ` Bryan Henderson 2002-12-04 4:29 ` Andrew Morton 2002-12-04 19:00 ` Bryan Henderson 2002-12-04 19:23 ` Matthew Wilcox 2002-12-04 22:17 ` Bryan Henderson 2002-12-05 10:36 ` Arnaldo Carvalho de Melo 2002-12-05 16:31 ` Stephen C. Tweedie 2002-12-05 17:24 ` girish 2002-12-04 21:10 ` Stephen C. Tweedie 2002-12-04 22:46 ` Bryan Henderson
This is an external index of several public inboxes, see mirroring instructions on how to clone and mirror all data and code used by this external index.