linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Valerie Aurora <val@vaaconsulting.com>
To: linux-fsdevel@vger.kernel.org, Jan Kara <jack@suse.cz>,
	Dave Chinner <david@fromorbit.com>,
	Masayoshi MIZUMA <m.mizuma@jp.fujitsu.com>,
	Greg Freemyer <greg.freemyer@gmail.com>,
	linux
Subject: [RFC PATCH 1/3] VFS: Fix s_umount thaw/write deadlock
Date: Mon, 12 Sep 2011 19:57:11 -0700	[thread overview]
Message-ID: <CAD-Xujm87Aj6y09GRXPVMZM+9N_7wyte-YNp7t0Wi3R2qH2mHQ@mail.gmail.com> (raw)
In-Reply-To: <CAD-Xujmjai2evYC4a4CdNPtjh_9xhbELcZePtGSS54VPRGpznA@mail.gmail.com>

[-- Attachment #1: Type: text/plain, Size: 1 bytes --]



[-- Attachment #2: 1 --]
[-- Type: application/octet-stream, Size: 8828 bytes --]

commit 043b1284c113cfbaa6a175ad21f369cf59215c71
Author: Valerie Aurora <val@vaaconsulting.com>
Date:   Fri Aug 26 20:32:57 2011 -0400

    VFS: Fix s_umount thaw/write deadlock
    
    File system freeze/thaw require the superblock's s_umount lock.
    However, we write to the file system while holding the s_umount lock
    in several places (e.g., writeback and quotas).  Any of these can
    block on a frozen file system while holding s_umount, preventing any
    later thaw from occurring, since thaw requires s_umount.
    
    The solution is to add a check, vfs_is_frozen(), to all code paths
    that write while holding sb->s_umount and return without writing if it
    is true.

diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index c00e055..90f8f7e 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -57,6 +57,14 @@ int drop_caches_sysctl_handler(ctl_table *table, int write,
 	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
 	if (ret)
 		return ret;
+
+	/*
+	 * Any file-system specific routines eventually called by
+	 * drop_pagecache_sb() and drop_slab() ought to check for a
+	 * frozen file system before writing to the disk.  Most file
+	 * systems won't write in this case but XFS is a notable
+	 * exception in certain cases.
+	 */
 	if (write) {
 		if (sysctl_drop_caches & 1)
 			iterate_supers(drop_pagecache_sb, NULL);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 04cf3b9..d1dca03 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -537,6 +537,9 @@ static long writeback_sb_inodes(struct super_block *sb,
 	long write_chunk;
 	long wrote = 0;  /* count both pages and inodes */
 
+	if (vfs_is_frozen(sb))
+		return 0;
+
 	while (!list_empty(&wb->b_io)) {
 		struct inode *inode = wb_inode(wb->b_io.prev);
 
@@ -1238,39 +1241,43 @@ EXPORT_SYMBOL(writeback_inodes_sb);
  * writeback_inodes_sb_if_idle	-	start writeback if none underway
  * @sb: the superblock
  *
- * Invoke writeback_inodes_sb if no writeback is currently underway.
- * Returns 1 if writeback was started, 0 if not.
+ * Invoke writeback_inodes_sb if no writeback is currently underway
+ * and no one else holds the s_umount lock.  Returns 1 if writeback
+ * was started, 0 if not.
  */
 int writeback_inodes_sb_if_idle(struct super_block *sb)
 {
 	if (!writeback_in_progress(sb->s_bdi)) {
-		down_read(&sb->s_umount);
-		writeback_inodes_sb(sb);
-		up_read(&sb->s_umount);
-		return 1;
-	} else
-		return 0;
+		if (down_read_trylock(&sb->s_umount)) {
+			writeback_inodes_sb(sb);
+			up_read(&sb->s_umount);
+			return 1;
+		}
+	}
+	return 0;
 }
 EXPORT_SYMBOL(writeback_inodes_sb_if_idle);
 
 /**
- * writeback_inodes_sb_if_idle	-	start writeback if none underway
+ * writeback_inodes_sb_nr_if_idle	-	start writeback if none underway
  * @sb: the superblock
  * @nr: the number of pages to write
  *
- * Invoke writeback_inodes_sb if no writeback is currently underway.
- * Returns 1 if writeback was started, 0 if not.
+ * Invoke writeback_inodes_sb if no writeback is currently underway
+ * and no one else holds the s_umount lock.  Returns 1 if writeback
+ * was started, 0 if not.
  */
 int writeback_inodes_sb_nr_if_idle(struct super_block *sb,
 				   unsigned long nr)
 {
 	if (!writeback_in_progress(sb->s_bdi)) {
-		down_read(&sb->s_umount);
-		writeback_inodes_sb_nr(sb, nr);
-		up_read(&sb->s_umount);
-		return 1;
-	} else
-		return 0;
+		if (down_read_trylock(&sb->s_umount)) {
+			writeback_inodes_sb_nr(sb, nr);
+			up_read(&sb->s_umount);
+			return 1;
+		}
+	}
+	return 0;
 }
 EXPORT_SYMBOL(writeback_inodes_sb_nr_if_idle);
 
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index b34bdb2..993ce22 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -47,7 +47,7 @@ static int check_quotactl_permission(struct super_block *sb, int type, int cmd,
 
 static void quota_sync_one(struct super_block *sb, void *arg)
 {
-	if (sb->s_qcop && sb->s_qcop->quota_sync)
+	if (sb->s_qcop && sb->s_qcop->quota_sync && !vfs_is_frozen(sb))
 		sb->s_qcop->quota_sync(sb, *(int *)arg, 1);
 }
 
@@ -366,6 +366,18 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special,
 	if (IS_ERR(sb))
 		return PTR_ERR(sb);
 
+	/*
+	 * It's not clear which quota functions may write to the file
+	 * system (all?).  Check for a frozen fs and bail out now.
+	 */
+	if (vfs_is_frozen(sb)) {
+		drop_super(sb);
+		/* XXX Should quotactl_block() error path do this too? */
+		if (pathp && !IS_ERR(pathp))
+			path_put(pathp);
+		return -EBUSY;
+	}
+
 	ret = do_quotactl(sb, type, cmds, id, addr, pathp);
 
 	drop_super(sb);
diff --git a/fs/super.c b/fs/super.c
index 3f56a26..11ef978 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -526,6 +526,10 @@ void sync_supers(void)
  *
  *	Scans the superblock list and calls given function, passing it
  *	locked superblock and given argument.
+ *
+ *	Note: If @f is going to write to the superblock, it must first
+ *	check if the file system is frozen (via vfs_is_frozen(sb)) and
+ *	refuse to write if so.
  */
 void iterate_supers(void (*f)(struct super_block *, void *), void *arg)
 {
@@ -561,6 +565,10 @@ void iterate_supers(void (*f)(struct super_block *, void *), void *arg)
  *
  *	Scans the superblock list and calls given function, passing it
  *	locked superblock and given argument.
+ *
+ *	Note: If @f is going to write to the superblock, it must first
+ *	check if the file system is frozen (via vfs_is_frozen(sb)) and
+ *	refuse to write if so.
  */
 void iterate_supers_type(struct file_system_type *type,
 	void (*f)(struct super_block *, void *), void *arg)
@@ -595,6 +603,10 @@ EXPORT_SYMBOL(iterate_supers_type);
  *	
  *	Scans the superblock list and finds the superblock of the file system
  *	mounted on the device given. %NULL is returned if no match is found.
+ *
+ *	Note: If caller is going to write to the superblock, it must first
+ *	check if the file system is frozen (via vfs_is_frozen(sb)) and
+ *	refuse to write if so.
  */
 
 struct super_block *get_super(struct block_device *bdev)
@@ -1127,6 +1139,23 @@ out:
  * Syncs the super to make sure the filesystem is consistent and calls the fs's
  * freeze_fs.  Subsequent calls to this without first thawing the fs will return
  * -EBUSY.
+ *
+ * During this function, sb->s_frozen goes through these values:
+ *
+ * SB_UNFROZEN: File system is normal, all writes progress as usual.
+ *
+ * SB_FREEZE_WRITE: The file system is in the process of being frozen
+ * and out-standing writes are being written out.  Writes that
+ * complete in-process writes should be permitted but new ones should
+ * be blocked.
+ *
+ * SB_FREEZE_TRANS: The file system is frozen.  No writes are
+ * permitted to any part.
+ *
+ * sb->s_frozen is protected by sb->s_umount.  Additionally,
+ * SB_FREEZE_WRITE is only temporarily set during freeze/thaw while
+ * holding sb->s_umount for writing, so any other callers holding
+ * sb->s_umount will never see this state.
  */
 int freeze_super(struct super_block *sb)
 {
diff --git a/fs/sync.c b/fs/sync.c
index c98a747..db15b11 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -68,7 +68,7 @@ int sync_filesystem(struct super_block *sb)
 	/*
 	 * No point in syncing out anything if the filesystem is read-only.
 	 */
-	if (sb->s_flags & MS_RDONLY)
+	if ((sb->s_flags & MS_RDONLY) && !vfs_is_frozen(sb))
 		return 0;
 
 	ret = __sync_filesystem(sb, 0);
@@ -80,7 +80,7 @@ EXPORT_SYMBOL_GPL(sync_filesystem);
 
 static void sync_one_sb(struct super_block *sb, void *arg)
 {
-	if (!(sb->s_flags & MS_RDONLY))
+	if (!(sb->s_flags & MS_RDONLY) && !vfs_is_frozen(sb))
 		__sync_filesystem(sb, *(int *)arg);
 }
 /*
@@ -136,7 +136,7 @@ SYSCALL_DEFINE1(syncfs, int, fd)
 {
 	struct file *file;
 	struct super_block *sb;
-	int ret;
+	int ret = 0;
 	int fput_needed;
 
 	file = fget_light(fd, &fput_needed);
@@ -145,7 +145,8 @@ SYSCALL_DEFINE1(syncfs, int, fd)
 	sb = file->f_dentry->d_sb;
 
 	down_read(&sb->s_umount);
-	ret = sync_filesystem(sb);
+	if (!(sb->s_flags & MS_RDONLY) && !vfs_is_frozen(sb))
+		ret = sync_filesystem(sb);
 	up_read(&sb->s_umount);
 
 	fput_light(file, fput_needed);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f23bcb7..d23ab4f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1457,7 +1457,7 @@ extern void prune_dcache_sb(struct super_block *sb, int nr_to_scan);
 extern struct timespec current_fs_time(struct super_block *sb);
 
 /*
- * Snapshotting support.
+ * Snapshotting support.  See freeze_super() for documentation.
  */
 enum {
 	SB_UNFROZEN = 0,
@@ -1468,6 +1468,10 @@ enum {
 #define vfs_check_frozen(sb, level) \
 	wait_event((sb)->s_wait_unfrozen, ((sb)->s_frozen < (level)))
 
+static inline int vfs_is_frozen(struct super_block *sb) {
+	return sb->s_frozen == SB_FREEZE_TRANS;
+}
+
 /*
  * until VFS tracks user namespaces for inodes, just make all files
  * belong to init_user_ns

  reply	other threads:[~2011-09-13  2:57 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2011-09-13  2:53 [RFC PATCH 0/3] VFS: Fix s_umount thaw/write deadlock Valerie Aurora
2011-09-13  2:57 ` Valerie Aurora [this message]
2011-09-14 14:00   ` [RFC PATCH 1/3] " Jan Kara
2011-09-14 23:53     ` Valerie Aurora
2011-09-15 16:22       ` Jan Kara
2011-09-18 23:25       ` Dave Chinner
2011-09-20 22:51   ` Christoph Hellwig
2011-09-14 13:05 ` [RFC PATCH 0/3] " Jan Kara
2011-09-14 23:22   ` Valerie Aurora
2011-10-27 21:39 ` Christoph Hellwig
2011-10-27 22:08   ` Valerie Aurora
2011-10-30  0:59     ` Valerie Aurora
2011-11-28 20:58       ` Valerie Aurora

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=CAD-Xujm87Aj6y09GRXPVMZM+9N_7wyte-YNp7t0Wi3R2qH2mHQ@mail.gmail.com \
    --to=val@vaaconsulting.com \
    --cc=david@fromorbit.com \
    --cc=greg.freemyer@gmail.com \
    --cc=jack@suse.cz \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=m.mizuma@jp.fujitsu.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).