linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Nick Piggin <npiggin@kernel.dk>
To: linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org
Cc: Andrew Morton <akpm@linux-foundation.org>
Subject: [patch 7/8] fs: fix or note I_DIRTY handling bugs in filesystems
Date: Sat, 18 Dec 2010 12:46:41 +1100	[thread overview]
Message-ID: <20101218015117.652706340@kernel.dk> (raw)
In-Reply-To: 20101218014634.943276411@kernel.dk

[-- Attachment #1: fs-fix-dirty-flags.patch --]
[-- Type: text/plain, Size: 11209 bytes --]

Checking I_DIRTY* bits is racy unless we're under I_SYNC, because there
might be a concurrent writeback thread that has cleared I_DIRTY, but
has not yet completed the ->write_inode call.

This means our (typically integrity/fsync) operation can finish before
previously dirty data has been written safely to disk.

Solve it by exporting inode_writeback_begin/end to filesystems, and have
them use that before checking dirty flags, where it matters.

I'm not thrilled about exporting inode_lock; however, I didn't see a good
way to make that code fully generic (e.g. some cases want to avoid taking
and dropping locks if there is nothing to do, future users might want to
be smarter about dirty bits and keep their own inode dirty bits in sync
under the same lock, etc).

Not signed off yet

Index: linux-2.6/drivers/staging/pohmelfs/inode.c
===================================================================
--- linux-2.6.orig/drivers/staging/pohmelfs/inode.c	2010-12-18 03:03:37.000000000 +1100
+++ linux-2.6/drivers/staging/pohmelfs/inode.c	2010-12-18 03:04:10.000000000 +1100
@@ -373,6 +373,7 @@ static int pohmelfs_write_inode_create_c
 		dprintk("%s: parent: %llu, ino: %llu, inode: %p.\n",
 				__func__, parent->ino, n->ino, inode);
 
+		/* XXX: is this racy WRT writeback? */
 		if (inode && (inode->i_state & I_DIRTY)) {
 			struct pohmelfs_inode *pi = POHMELFS_I(inode);
 			pohmelfs_write_create_inode(pi);
Index: linux-2.6/fs/gfs2/file.c
===================================================================
--- linux-2.6.orig/fs/gfs2/file.c	2010-12-18 03:03:37.000000000 +1100
+++ linux-2.6/fs/gfs2/file.c	2010-12-18 03:04:10.000000000 +1100
@@ -557,7 +557,7 @@ static int gfs2_close(struct inode *inod
 static int gfs2_fsync(struct file *file, int datasync)
 {
 	struct inode *inode = file->f_mapping->host;
-	int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC);
+	unsigned dirty, mask;
 	int ret = 0;
 
 	if (gfs2_is_jdata(GFS2_I(inode))) {
@@ -565,13 +565,35 @@ static int gfs2_fsync(struct file *file,
 		return 0;
 	}
 
-	if (sync_state != 0) {
-		if (!datasync)
-			ret = write_inode_now(inode, 0);
+	spin_lock(&inode_lock);
+	inode_writeback_begin(inode, 1);
 
-		if (gfs2_is_stuffed(GFS2_I(inode)))
-			gfs2_log_flush(GFS2_SB(inode), GFS2_I(inode)->i_gl);
+	if (datasync)
+		mask = I_DIRTY_DATASYNC;
+	else
+		mask = I_DIRTY_SYNC | I_DIRTY_DATASYNC;
+	dirty = inode->i_state & mask;
+	inode->i_state &= ~mask;
+	if (dirty) {
+		spin_unlock(&inode_lock);
+
+		if (!datasync) {
+			struct writeback_control wbc = {
+				.sync_mode = WB_SYNC_ALL,
+			};
+			ret = inode->i_sb->s_op->write_inode(inode, &wbc);
+		} else {
+			if (gfs2_is_stuffed(GFS2_I(inode)))
+				gfs2_log_flush(GFS2_SB(inode),
+						GFS2_I(inode)->i_gl);
+		}
+
+		spin_lock(&inode_lock);
 	}
+	if (ret)
+		inode->i_state |= dirty;
+	inode_writeback_end(inode);
+	spin_unlock(&inode_lock);
 
 	return ret;
 }
Index: linux-2.6/fs/jffs2/fs.c
===================================================================
--- linux-2.6.orig/fs/jffs2/fs.c	2010-12-18 03:03:37.000000000 +1100
+++ linux-2.6/fs/jffs2/fs.c	2010-12-18 03:04:10.000000000 +1100
@@ -361,6 +361,7 @@ void jffs2_dirty_inode(struct inode *ino
 {
 	struct iattr iattr;
 
+	/* XXX: huh? How does this make sense? */
 	if (!(inode->i_state & I_DIRTY_DATASYNC)) {
 		D2(printk(KERN_DEBUG "jffs2_dirty_inode() not calling setattr() for ino #%lu\n", inode->i_ino));
 		return;
Index: linux-2.6/fs/jfs/file.c
===================================================================
--- linux-2.6.orig/fs/jfs/file.c	2010-12-18 03:03:37.000000000 +1100
+++ linux-2.6/fs/jfs/file.c	2010-12-18 03:04:10.000000000 +1100
@@ -19,6 +19,7 @@
 
 #include <linux/mm.h>
 #include <linux/fs.h>
+#include <linux/writeback.h>
 #include <linux/quotaops.h>
 #include "jfs_incore.h"
 #include "jfs_inode.h"
@@ -31,18 +32,34 @@
 int jfs_fsync(struct file *file, int datasync)
 {
 	struct inode *inode = file->f_mapping->host;
-	int rc = 0;
+	unsigned dirty, mask;
+	int err = 0;
 
-	if (!(inode->i_state & I_DIRTY) ||
-	    (datasync && !(inode->i_state & I_DIRTY_DATASYNC))) {
+	spin_lock(&inode_lock);
+	inode_writeback_begin(inode, 1);
+
+	if (datasync)
+		mask = I_DIRTY_DATASYNC;
+	else
+		mask = I_DIRTY_SYNC | I_DIRTY_DATASYNC;
+	dirty = inode->i_state & mask;
+	inode->i_state &= ~mask;
+	spin_unlock(&inode_lock);
+
+	if (!dirty) {
 		/* Make sure committed changes hit the disk */
 		jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1);
-		return rc;
+	} else {
+		err = jfs_commit_inode(inode, 1);
 	}
 
-	rc |= jfs_commit_inode(inode, 1);
+	spin_lock(&inode_lock);
+	if (err)
+		inode->i_state |= dirty;
+	inode_writeback_end(inode);
+	spin_unlock(&inode_lock);
 
-	return rc ? -EIO : 0;
+	return err ? -EIO : 0;
 }
 
 static int jfs_open(struct inode *inode, struct file *file)
Index: linux-2.6/fs/nfsd/vfs.c
===================================================================
--- linux-2.6.orig/fs/nfsd/vfs.c	2010-12-18 03:03:37.000000000 +1100
+++ linux-2.6/fs/nfsd/vfs.c	2010-12-18 03:04:10.000000000 +1100
@@ -969,10 +969,9 @@ static int wait_for_concurrent_writes(st
 		dprintk("nfsd: write resume %d\n", task_pid_nr(current));
 	}
 
-	if (inode->i_state & I_DIRTY) {
-		dprintk("nfsd: write sync %d\n", task_pid_nr(current));
-		err = vfs_fsync(file, 0);
-	}
+	dprintk("nfsd: write sync %d\n", task_pid_nr(current));
+	err = vfs_fsync(file, 0);
+
 	last_ino = inode->i_ino;
 	last_dev = inode->i_sb->s_dev;
 	return err;
Index: linux-2.6/fs/ocfs2/file.c
===================================================================
--- linux-2.6.orig/fs/ocfs2/file.c	2010-12-18 03:03:37.000000000 +1100
+++ linux-2.6/fs/ocfs2/file.c	2010-12-18 03:04:10.000000000 +1100
@@ -176,12 +176,24 @@ static int ocfs2_sync_file(struct file *
 	journal_t *journal;
 	struct inode *inode = file->f_mapping->host;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	unsigned dirty, mask;
 
 	mlog_entry("(0x%p, %d, 0x%p, '%.*s')\n", file, datasync,
 		   file->f_path.dentry, file->f_path.dentry->d_name.len,
 		   file->f_path.dentry->d_name.name);
 
-	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) {
+	spin_lock(&inode_lock);
+	inode_writeback_begin(inode, 1);
+	if (datasync)
+		mask = I_DIRTY_DATASYNC;
+	else
+		mask = I_DIRTY_SYNC | I_DIRTY_DATASYNC;
+	dirty = inode->i_state & mask;
+	inode->i_state &= ~mask;
+	spin_unlock(&inode_lock);
+
+	if (datasync && dirty) {
+
 		/*
 		 * We still have to flush drive's caches to get data to the
 		 * platter
@@ -195,6 +207,12 @@ static int ocfs2_sync_file(struct file *
 	err = jbd2_journal_force_commit(journal);
 
 bail:
+	spin_lock(&inode_lock);
+	if (err)
+		inode->i_state |= dirty;
+	inode_writeback_end(inode);
+	spin_unlock(&inode_lock);
+
 	mlog_exit(err);
 
 	return (err < 0) ? -EIO : 0;
Index: linux-2.6/fs/ubifs/file.c
===================================================================
--- linux-2.6.orig/fs/ubifs/file.c	2010-12-18 03:03:37.000000000 +1100
+++ linux-2.6/fs/ubifs/file.c	2010-12-18 03:04:10.000000000 +1100
@@ -1313,11 +1313,9 @@ int ubifs_fsync(struct file *file, int d
 	 * VFS has already synchronized dirty pages for this inode. Synchronize
 	 * the inode unless this is a 'datasync()' call.
 	 */
-	if (!datasync || (inode->i_state & I_DIRTY_DATASYNC)) {
-		err = inode->i_sb->s_op->write_inode(inode, NULL);
-		if (err)
-			return err;
-	}
+	err = sync_inode_metadata(inode, datasync, 1);
+	if (err)
+		return err;
 
 	/*
 	 * Nodes related to this inode may still sit in a write-buffer. Flush
Index: linux-2.6/fs/ufs/truncate.c
===================================================================
--- linux-2.6.orig/fs/ufs/truncate.c	2010-12-18 03:03:37.000000000 +1100
+++ linux-2.6/fs/ufs/truncate.c	2010-12-18 03:04:10.000000000 +1100
@@ -479,7 +479,7 @@ int ufs_truncate(struct inode *inode, lo
 		retry |= ufs_trunc_tindirect (inode);
 		if (!retry)
 			break;
-		if (IS_SYNC(inode) && (inode->i_state & I_DIRTY))
+		if (IS_SYNC(inode))
 			ufs_sync_inode (inode);
 		blk_run_address_space(inode->i_mapping);
 		yield();
Index: linux-2.6/fs/xfs/linux-2.6/xfs_file.c
===================================================================
--- linux-2.6.orig/fs/xfs/linux-2.6/xfs_file.c	2010-12-18 03:03:37.000000000 +1100
+++ linux-2.6/fs/xfs/linux-2.6/xfs_file.c	2010-12-18 03:51:34.000000000 +1100
@@ -99,6 +99,7 @@ xfs_file_fsync(
 	struct xfs_trans	*tp;
 	int			error = 0;
 	int			log_flushed = 0;
+	unsigned		dirty, mask;
 
 	trace_xfs_file_fsync(ip);
 
@@ -110,6 +111,25 @@ xfs_file_fsync(
 	xfs_ioend_wait(ip);
 
 	/*
+	 * First check if the VFS inode is marked dirty.  All the dirtying
+	 * of non-transactional updates now goes through mark_inode_dirty*,
+	 * which allows us to distinguish between pure timestamp updates
+	 * and i_size updates which need to be caught for fdatasync.
+	 * After that also check for the dirty state in the XFS inode, which
+	 * might get cleared when the inode gets written out via the AIL
+	 * or xfs_iflush_cluster.
+	 */
+	spin_lock(&inode_lock);
+	inode_writeback_begin(inode, 1);
+	if (datasync)
+		mask = I_DIRTY_DATASYNC;
+	else
+		mask = I_DIRTY_SYNC | I_DIRTY_DATASYNC;
+	dirty = inode->i_state & mask;
+	inode->i_state &= ~mask;
+	spin_unlock(&inode_lock);
+
+	/*
 	 * We always need to make sure that the required inode state is safe on
 	 * disk.  The inode might be clean but we still might need to force the
 	 * log because of committed transactions that haven't hit the disk yet.
@@ -123,18 +143,7 @@ xfs_file_fsync(
 	 */
 	xfs_ilock(ip, XFS_ILOCK_SHARED);
 
-	/*
-	 * First check if the VFS inode is marked dirty.  All the dirtying
-	 * of non-transactional updates no goes through mark_inode_dirty*,
-	 * which allows us to distinguish beteeen pure timestamp updates
-	 * and i_size updates which need to be caught for fdatasync.
-	 * After that also theck for the dirty state in the XFS inode, which
-	 * might gets cleared when the inode gets written out via the AIL
-	 * or xfs_iflush_cluster.
-	 */
-	if (((inode->i_state & I_DIRTY_DATASYNC) ||
-	    ((inode->i_state & I_DIRTY_SYNC) && !datasync)) &&
-	    ip->i_update_core) {
+	if (dirty && ip->i_update_core) {
 		/*
 		 * Kick off a transaction to log the inode core to get the
 		 * updates.  The sync transaction will also force the log.
@@ -145,7 +154,7 @@ xfs_file_fsync(
 				XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0);
 		if (error) {
 			xfs_trans_cancel(tp, 0);
-			return -error;
+			goto out;
 		}
 		xfs_ilock(ip, XFS_ILOCK_EXCL);
 
@@ -197,6 +206,13 @@ xfs_file_fsync(
 			xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp);
 	}
 
+out:
+	spin_lock(&inode_lock);
+	if (error)
+		inode->i_state |= dirty;
+	inode_writeback_end(inode);
+	spin_unlock(&inode_lock);
+
 	return -error;
 }
 
Index: linux-2.6/fs/inode.c
===================================================================
--- linux-2.6.orig/fs/inode.c	2010-12-18 03:03:37.000000000 +1100
+++ linux-2.6/fs/inode.c	2010-12-18 03:04:10.000000000 +1100
@@ -82,6 +82,7 @@ static struct hlist_head *inode_hashtabl
  * the i_state of an inode while it is in use..
  */
 DEFINE_SPINLOCK(inode_lock);
+EXPORT_SYMBOL(inode_lock);
 
 /*
  * iprune_sem provides exclusion between the kswapd or try_to_free_pages



  parent reply	other threads:[~2010-12-18  2:02 UTC|newest]

Thread overview: 41+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2010-12-18  1:46 [patch 0/8] Inode data integrity patches Nick Piggin
2010-12-18  1:46 ` [patch 1/8] fs: mark_inode_dirty barrier fix Nick Piggin
2010-12-18  1:46 ` [patch 2/8] fs: simple fsync race fix Nick Piggin
2010-12-18  1:46 ` [patch 3/8] fs: introduce inode writeback helpers Nick Piggin
2010-12-18  1:46 ` [patch 4/8] fs: preserve inode dirty bits on failed metadata writeback Nick Piggin
2010-12-18  1:46 ` [patch 5/8] fs: ext2 inode sync fix Nick Piggin
2011-01-07 19:08   ` Ted Ts'o
2010-12-18  1:46 ` [patch 6/8] fs: fsync optimisations Nick Piggin
2010-12-18  1:46 ` Nick Piggin [this message]
2010-12-29 15:01   ` [patch 7/8] fs: fix or note I_DIRTY handling bugs in filesystems Christoph Hellwig
2011-01-03 15:03     ` Steven Whitehouse
2011-01-03 16:58       ` Christoph Hellwig
2011-01-04  7:12         ` Nick Piggin
2011-01-04 14:22         ` Steven Whitehouse
2011-01-04  6:04     ` Nick Piggin
2011-01-04  6:39       ` Christoph Hellwig
2011-01-04  7:52         ` Nick Piggin
2011-01-04  9:13           ` Christoph Hellwig
2011-01-04  9:28             ` Nick Piggin
2010-12-18  1:46 ` [patch 8/8] fs: add i_op->sync_inode Nick Piggin
2010-12-29 15:12   ` Christoph Hellwig
2011-01-04  6:27     ` Nick Piggin
2011-01-04  6:57       ` Christoph Hellwig
2011-01-04  8:03         ` Nick Piggin
2011-01-04  8:31           ` Nick Piggin
2011-01-04  9:25             ` Christoph Hellwig
2011-01-04  9:52               ` Nick Piggin
2011-01-06 20:49                 ` Christoph Hellwig
2011-01-07  4:48                   ` Nick Piggin
2011-01-07  7:25                     ` Christoph Hellwig
2011-01-11  3:44                       ` Nick Piggin
2011-01-04  9:25           ` Christoph Hellwig
2011-01-04  9:49             ` Nick Piggin
2011-01-06 20:45               ` Christoph Hellwig
2011-01-07  4:47                 ` Nick Piggin
2011-01-07  7:24                   ` Christoph Hellwig
2011-01-07  7:29                     ` Christoph Hellwig
2011-01-07 13:10                       ` Christoph Hellwig
2011-01-07 18:30                       ` Ted Ts'o
2011-01-07 18:32                         ` Christoph Hellwig
2011-01-07 19:06   ` Ted Ts'o

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20101218015117.652706340@kernel.dk \
    --to=npiggin@kernel.dk \
    --cc=akpm@linux-foundation.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).