From: Dmitry Monakhov <dmonakhov@openvz.org>
To: linux-fsdevel@vger.kernel.org
Cc: viro@ZenIV.linux.org.uk, Dmitry Monakhov <dmonakhov@openvz.org>
Subject: [PATCH 3/3] vfs: fix filesystem_sync vs write race on rw=>ro remount v2
Date: Sat, 27 Feb 2010 03:25:49 +0300 [thread overview]
Message-ID: <1267230349-8192-3-git-send-email-dmonakhov@openvz.org> (raw)
In-Reply-To: <1267230349-8192-2-git-send-email-dmonakhov@openvz.org>
Currently on rw=>ro remount we have following race
| mount /mnt -oremount,ro | write-task |
|-------------------------+------------|
| | open(RDWR) |
| shrink_dcache_sb(sb); | |
| sync_filesystem(sb); | |
| | write() |
| | close() |
| fs_may_remount_ro(sb) | |
| sb->s_flags = new_flags | |
Later writeback or sync() will result in error due to MS_RDONLY flag
In case of ext4 this result in jbd2_start failure on writeback
LOG: ext4_da_writepages: jbd2_start: 1024 pages, ino 1431; err -30
This patch fix the issue like follows:
1) Introduce ST_REMOUNT_RO bit with will be set on remount start
This bit will be cleared if new writers appear.
2) Redesign fs_may_remount_ro. Now it is just calculate sum of
corresponding vfsmounts
3) The rest is simple, we just perform sync and check than no new
writers appear.
##TESTCASE_BEGIN:
#! /bin/bash -x
DEV=/dev/sdb5
FSTYPE=ext4
BINDIR=/home/dmon
MNTOPT="data=ordered"
umount /mnt
mkfs.${FSTYPE} ${DEV} || exit 1
mount ${DEV} /mnt -o${MNTOPT} || exit 1
${BINDIR}/fsstress -p1 -l999999999 -n9999999999 -d /mnt/test &
sleep 15
mount /mnt -oremount,ro,${MNTOPT}
sleep 1
killall -9 fsstress
sync
# after this you may get following message in dmesg
# "ext4_da_writepages: jbd2_start: 1024 pages, ino 1431; err -30"
##TESTCASE_END
Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
---
fs/file_table.c | 24 ------------------------
fs/namespace.c | 48 ++++++++++++++++++++++++++++++++++++++++++++----
fs/super.c | 33 +++++++++++++++++++++++++++------
include/linux/fs.h | 1 +
4 files changed, 72 insertions(+), 34 deletions(-)
diff --git a/fs/file_table.c b/fs/file_table.c
index b98404b..32d37af 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -348,30 +348,6 @@ void file_kill(struct file *file)
}
}
-int fs_may_remount_ro(struct super_block *sb)
-{
- struct file *file;
-
- /* Check that no files are currently opened for writing. */
- file_list_lock();
- list_for_each_entry(file, &sb->s_files, f_u.fu_list) {
- struct inode *inode = file->f_path.dentry->d_inode;
-
- /* File with pending delete? */
- if (inode->i_nlink == 0)
- goto too_bad;
-
- /* Writeable file? */
- if (S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE))
- goto too_bad;
- }
- file_list_unlock();
- return 1; /* Tis' cool bro. */
-too_bad:
- file_list_unlock();
- return 0;
-}
-
/**
* mark_files_ro - mark all files read-only
* @sb: superblock in question
diff --git a/fs/namespace.c b/fs/namespace.c
index e816097..bc79f1f 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -277,6 +277,13 @@ int mnt_want_write(struct vfsmount *mnt)
dec_mnt_writers(mnt);
ret = -EROFS;
goto out;
+ } else {
+ /*
+ * Clear ST_REMOUNT_RO flag to let remount task know
+ * about new writers.
+ */
+ if (unlikely(test_bit(ST_REMOUNT_RO, &mnt->mnt_sb->s_state)))
+ clear_bit(ST_REMOUNT_RO, &mnt->mnt_sb->s_state);
}
out:
preempt_enable();
@@ -341,11 +348,10 @@ void mnt_drop_write(struct vfsmount *mnt)
}
EXPORT_SYMBOL_GPL(mnt_drop_write);
-static int mnt_make_readonly(struct vfsmount *mnt)
+static int mnt_check_writers(struct vfsmount *mnt, int set_ro)
{
int ret = 0;
- spin_lock(&vfsmount_lock);
mnt->mnt_flags |= MNT_WRITE_HOLD;
/*
* After storing MNT_WRITE_HOLD, we'll read the counters. This store
@@ -371,14 +377,23 @@ static int mnt_make_readonly(struct vfsmount *mnt)
*/
if (count_mnt_writers(mnt) > 0)
ret = -EBUSY;
- else
- mnt->mnt_flags |= MNT_READONLY;
+ else {
+ if (likely(set_ro))
+ mnt->mnt_flags |= MNT_READONLY;
+ }
/*
* MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
* that become unheld will see MNT_READONLY.
*/
smp_wmb();
mnt->mnt_flags &= ~MNT_WRITE_HOLD;
+ return ret;
+}
+static int mnt_make_readonly(struct vfsmount *mnt)
+{
+ int ret;
+ spin_lock(&vfsmount_lock);
+ ret = mnt_check_writers(mnt, 1);
spin_unlock(&vfsmount_lock);
return ret;
}
@@ -389,6 +404,31 @@ static void __mnt_unmake_readonly(struct vfsmount *mnt)
mnt->mnt_flags &= ~MNT_READONLY;
spin_unlock(&vfsmount_lock);
}
+/**
+ * Check whenever is it possible to remount given sb to readonly.
+ * @sb : super block in question
+ *
+ * Caller is responsible to set ST_REMOUNT_RO state before the call.
+ */
+int fs_may_remount_ro(struct super_block *sb)
+{
+ struct vfsmount *mnt;
+ int ret = 1;
+ spin_lock(&vfsmount_lock);
+ list_for_each_entry(mnt, &sb->s_vfsmount, mnt_sb_list) {
+ ret = !mnt_check_writers(mnt, 0);
+ if (ret)
+ break;
+ }
+ spin_unlock(&vfsmount_lock);
+ /*
+ * If new writer appears after we have checked all vfsmounts.
+ * Then ST_REMOUNT_RO bit will be cleared.
+ */
+ if (!test_bit(ST_REMOUNT_RO, &sb->s_state))
+ ret = 0;
+ return ret;
+}
void super_add_mnt(struct super_block *sb, struct vfsmount *mnt)
{
diff --git a/fs/super.c b/fs/super.c
index d3e0083..8cf148b 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -571,6 +571,9 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
int retval;
int remount_rw, remount_ro;
+ remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
+ remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY);
+
if (sb->s_frozen != SB_UNFROZEN)
return -EBUSY;
@@ -581,18 +584,33 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
if (flags & MS_RDONLY)
acct_auto_close(sb);
+ if (remount_ro) {
+ if (force)
+ mark_files_ro(sb);
+ /*
+ * Store ST_REMOUNT_RO flag. New writers (in any) will clrear
+ * this bit.
+ */
+ set_bit(ST_REMOUNT_RO, &sb->s_state);
+ /*
+ * This store should be visible before we do.
+ */
+ smp_mb();
+ /*
+ * To prevent sync vs write race condition we have to check
+ * writers before filesystem sync.
+ */
+ if (!fs_may_remount_ro(sb))
+ return -EBUSY;
+ }
shrink_dcache_sb(sb);
sync_filesystem(sb);
- remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
- remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY);
/* If we are remounting RDONLY and current sb is read/write,
- make sure there are no rw files opened */
+ make sure there are no new rw files opened */
if (remount_ro) {
- if (force)
- mark_files_ro(sb);
- else if (!fs_may_remount_ro(sb))
+ if (!fs_may_remount_ro(sb))
return -EBUSY;
retval = vfs_dq_off(sb, 1);
if (retval < 0 && retval != -ENOSYS)
@@ -617,6 +635,9 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
*/
if (remount_ro && sb->s_bdev)
invalidate_bdev(sb->s_bdev);
+
+ WARN_ON(remount_ro && !fs_may_remount_ro(sb));
+ clear_bit(ST_REMOUNT_RO, &sb->s_state);
return 0;
}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 75f057d..0ad7aa4 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1308,6 +1308,7 @@ extern int send_sigurg(struct fown_struct *fown);
enum {
ST_FS_SYNC, /* fssync is about to happen */
+ ST_REMOUNT_RO, /* readonly remount is in progress */
};
extern struct list_head super_blocks;
extern spinlock_t sb_lock;
--
1.6.6
next prev parent reply other threads:[~2010-02-27 0:25 UTC|newest]
Thread overview: 10+ messages / expand[flat|nested] mbox.gz Atom feed top
2010-02-27 0:25 [PATCH 1/3] vfs: redesign sb state flags Dmitry Monakhov
2010-02-27 0:25 ` [PATCH 2/3] vfs: per-sb write count preparation stage Dmitry Monakhov
2010-02-27 0:25 ` Dmitry Monakhov [this message]
2010-03-02 13:29 ` [PATCH 3/3] vfs: fix filesystem_sync vs write race on rw=>ro remount v2 Jan Kara
2010-03-02 14:04 ` Dmitry Monakhov
2010-03-02 14:06 ` Jan Kara
2010-03-02 14:24 ` Dmitry Monakhov
2010-03-02 14:35 ` Jan Kara
2010-03-02 15:38 ` Dmitry Monakhov
2010-03-02 15:21 ` Nick Piggin
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1267230349-8192-3-git-send-email-dmonakhov@openvz.org \
--to=dmonakhov@openvz.org \
--cc=linux-fsdevel@vger.kernel.org \
--cc=viro@ZenIV.linux.org.uk \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).