All of lore.kernel.org
 help / color / mirror / Atom feed
From: Greg KH <gregkh@suse.de>
To: linux-kernel@vger.kernel.org, stable@kernel.org
Cc: stable-review@kernel.org, torvalds@linux-foundation.org,
	akpm@linux-foundation.org, alan@lxorguk.ukuu.org.uk,
	Jan Kara <jack@suse.cz>, "Theodore Tso" <tytso@mit.edu>,
	Greg Kroah-Hartman <gregkh@suse.de>
Subject: [32/34] ext4: Wait for proper transaction commit on fsync
Date: Thu, 10 Dec 2009 21:23:44 -0800	[thread overview]
Message-ID: <20091211052557.723287400@linux.site> (raw)
In-Reply-To: <20091211052858.GA23229@kroah.com>

[-- Attachment #1: 0028-ext4-Wait-for-proper-transaction-commit-on-fsync.patch --]
[-- Type: text/plain, Size: 7950 bytes --]

2.6.32-stable review patch.  If anyone has any objections, please let us know.

------------------

(cherry picked from commit b436b9bef84de6893e86346d8fbf7104bc520645)

We cannot rely on buffer dirty bits during fsync because pdflush can run
before fsync is called and clear the dirty bits without forcing a transaction
commit. Instead, we track which transaction last changed the inode and which
transaction last changed its block allocation, and force the appropriate one
to disk on fsync.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/ext4/ext4.h      |    7 +++++++
 fs/ext4/ext4_jbd2.h |   13 +++++++++++++
 fs/ext4/extents.c   |   14 ++++++++++++--
 fs/ext4/fsync.c     |   46 +++++++++++++++++-----------------------------
 fs/ext4/inode.c     |   29 +++++++++++++++++++++++++++++
 fs/ext4/super.c     |    2 ++
 fs/jbd2/journal.c   |    1 +
 7 files changed, 81 insertions(+), 31 deletions(-)

--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -703,6 +703,13 @@ struct ext4_inode_info {
 	struct list_head i_aio_dio_complete_list;
 	/* current io_end structure for async DIO write*/
 	ext4_io_end_t *cur_aio_dio;
+
+	/*
+	 * Transactions that contain inode's metadata needed to complete
+	 * fsync and fdatasync, respectively.
+	 */
+	tid_t i_sync_tid;
+	tid_t i_datasync_tid;
 };
 
 /*
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -258,6 +258,19 @@ static inline int ext4_jbd2_file_inode(h
 	return 0;
 }
 
+static inline void ext4_update_inode_fsync_trans(handle_t *handle,
+						 struct inode *inode,
+						 int datasync)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+
+	if (ext4_handle_valid(handle)) {
+		ei->i_sync_tid = handle->h_transaction->t_tid;
+		if (datasync)
+			ei->i_datasync_tid = handle->h_transaction->t_tid;
+	}
+}
+
 /* super.c */
 int ext4_force_commit(struct super_block *sb);
 
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3064,6 +3064,8 @@ ext4_ext_handle_uninitialized_extents(ha
 	if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) {
 		ret = ext4_convert_unwritten_extents_dio(handle, inode,
 							path);
+		if (ret >= 0)
+			ext4_update_inode_fsync_trans(handle, inode, 1);
 		goto out2;
 	}
 	/* buffered IO case */
@@ -3091,6 +3093,8 @@ ext4_ext_handle_uninitialized_extents(ha
 	ret = ext4_ext_convert_to_initialized(handle, inode,
 						path, iblock,
 						max_blocks);
+	if (ret >= 0)
+		ext4_update_inode_fsync_trans(handle, inode, 1);
 out:
 	if (ret <= 0) {
 		err = ret;
@@ -3329,10 +3333,16 @@ int ext4_ext_get_blocks(handle_t *handle
 	allocated = ext4_ext_get_actual_len(&newex);
 	set_buffer_new(bh_result);
 
-	/* Cache only when it is _not_ an uninitialized extent */
-	if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0)
+	/*
+	 * Cache the extent and update transaction to commit on fdatasync only
+	 * when it is _not_ an uninitialized extent.
+	 */
+	if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
 		ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
 						EXT4_EXT_CACHE_EXTENT);
+		ext4_update_inode_fsync_trans(handle, inode, 1);
+	} else
+		ext4_update_inode_fsync_trans(handle, inode, 0);
 out:
 	if (allocated > max_blocks)
 		allocated = max_blocks;
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -51,25 +51,30 @@
 int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
 {
 	struct inode *inode = dentry->d_inode;
+	struct ext4_inode_info *ei = EXT4_I(inode);
 	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
-	int err, ret = 0;
+	int ret;
+	tid_t commit_tid;
 
 	J_ASSERT(ext4_journal_current_handle() == NULL);
 
 	trace_ext4_sync_file(file, dentry, datasync);
 
+	if (inode->i_sb->s_flags & MS_RDONLY)
+		return 0;
+
 	ret = flush_aio_dio_completed_IO(inode);
 	if (ret < 0)
 		return ret;
+
+	if (!journal)
+		return simple_fsync(file, dentry, datasync);
+
 	/*
-	 * data=writeback:
+	 * data=writeback,ordered:
 	 *  The caller's filemap_fdatawrite()/wait will sync the data.
-	 *  sync_inode() will sync the metadata
-	 *
-	 * data=ordered:
-	 *  The caller's filemap_fdatawrite() will write the data and
-	 *  sync_inode() will write the inode if it is dirty.  Then the caller's
-	 *  filemap_fdatawait() will wait on the pages.
+	 *  Metadata is in the journal, we wait for proper transaction to
+	 *  commit here.
 	 *
 	 * data=journal:
 	 *  filemap_fdatawrite won't do anything (the buffers are clean).
@@ -82,27 +87,10 @@ int ext4_sync_file(struct file *file, st
 	if (ext4_should_journal_data(inode))
 		return ext4_force_commit(inode->i_sb);
 
-	if (!journal)
-		ret = sync_mapping_buffers(inode->i_mapping);
-
-	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
-		goto out;
-
-	/*
-	 * The VFS has written the file data.  If the inode is unaltered
-	 * then we need not start a commit.
-	 */
-	if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) {
-		struct writeback_control wbc = {
-			.sync_mode = WB_SYNC_ALL,
-			.nr_to_write = 0, /* sys_fsync did this */
-		};
-		err = sync_inode(inode, &wbc);
-		if (ret == 0)
-			ret = err;
-	}
-out:
-	if (journal && (journal->j_flags & JBD2_BARRIER))
+	commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
+	if (jbd2_log_start_commit(journal, commit_tid))
+		jbd2_log_wait_commit(journal, commit_tid);
+	else if (journal->j_flags & JBD2_BARRIER)
 		blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
 	return ret;
 }
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1025,6 +1025,8 @@ static int ext4_ind_get_blocks(handle_t
 		goto cleanup;
 
 	set_buffer_new(bh_result);
+
+	ext4_update_inode_fsync_trans(handle, inode, 1);
 got_it:
 	map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
 	if (count > blocks_to_boundary)
@@ -4794,6 +4796,7 @@ struct inode *ext4_iget(struct super_blo
 	struct ext4_inode *raw_inode;
 	struct ext4_inode_info *ei;
 	struct inode *inode;
+	journal_t *journal = EXT4_SB(sb)->s_journal;
 	long ret;
 	int block;
 
@@ -4858,6 +4861,31 @@ struct inode *ext4_iget(struct super_blo
 		ei->i_data[block] = raw_inode->i_block[block];
 	INIT_LIST_HEAD(&ei->i_orphan);
 
+	/*
+	 * Set transaction id's of transactions that have to be committed
+	 * to finish f[data]sync. We set them to currently running transaction
+	 * as we cannot be sure that the inode or some of its metadata isn't
+	 * part of the transaction - the inode could have been reclaimed and
+	 * now it is reread from disk.
+	 */
+	if (journal) {
+		transaction_t *transaction;
+		tid_t tid;
+
+		spin_lock(&journal->j_state_lock);
+		if (journal->j_running_transaction)
+			transaction = journal->j_running_transaction;
+		else
+			transaction = journal->j_committing_transaction;
+		if (transaction)
+			tid = transaction->t_tid;
+		else
+			tid = journal->j_commit_sequence;
+		spin_unlock(&journal->j_state_lock);
+		ei->i_sync_tid = tid;
+		ei->i_datasync_tid = tid;
+	}
+
 	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
 		ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
 		if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
@@ -5112,6 +5140,7 @@ static int ext4_do_update_inode(handle_t
 		err = rc;
 	ei->i_state &= ~EXT4_STATE_NEW;
 
+	ext4_update_inode_fsync_trans(handle, inode, 0);
 out_brelse:
 	brelse(bh);
 	ext4_std_error(inode->i_sb, err);
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -706,6 +706,8 @@ static struct inode *ext4_alloc_inode(st
 	spin_lock_init(&(ei->i_block_reservation_lock));
 	INIT_LIST_HEAD(&ei->i_aio_dio_complete_list);
 	ei->cur_aio_dio = NULL;
+	ei->i_sync_tid = 0;
+	ei->i_datasync_tid = 0;
 
 	return &ei->vfs_inode;
 }
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -78,6 +78,7 @@ EXPORT_SYMBOL(jbd2_journal_errno);
 EXPORT_SYMBOL(jbd2_journal_ack_err);
 EXPORT_SYMBOL(jbd2_journal_clear_err);
 EXPORT_SYMBOL(jbd2_log_wait_commit);
+EXPORT_SYMBOL(jbd2_log_start_commit);
 EXPORT_SYMBOL(jbd2_journal_start_commit);
 EXPORT_SYMBOL(jbd2_journal_force_commit_nested);
 EXPORT_SYMBOL(jbd2_journal_wipe);



  parent reply	other threads:[~2009-12-11  5:30 UTC|newest]

Thread overview: 35+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <20091211052312.805428372@linux.site>
2009-12-11  5:28 ` [00/34] 2.6.32.1-stable review Greg KH
2009-12-11  5:23   ` [01/34] signal: Fix alternate signal stack check Greg KH
2009-12-11  5:23   ` [02/34] SCSI: scsi_lib_dma: fix bug with dma maps on nested scsi objects Greg KH
2009-12-11  5:23   ` [03/34] SCSI: osd_protocol.h: Add missing #include Greg KH
2009-12-11  5:23   ` [04/34] SCSI: megaraid_sas: fix 64 bit sense pointer truncation Greg KH
2009-12-11  5:23   ` [05/34] ext4: fix potential buffer head leak when add_dirent_to_buf() returns ENOSPC Greg KH
2009-12-11  5:23   ` [06/34] ext4: avoid divide by zero when trying to mount a corrupted file system Greg KH
2009-12-11  5:23   ` [07/34] ext4: fix the returned block count if EXT4_IOC_MOVE_EXT fails Greg KH
2009-12-11  5:23   ` [08/34] ext4: fix lock order problem in ext4_move_extents() Greg KH
2009-12-11  5:23   ` [09/34] ext4: fix possible recursive locking warning in EXT4_IOC_MOVE_EXT Greg KH
2009-12-11  5:23   ` [10/34] ext4: plug a buffer_head leak in an error path of ext4_iget() Greg KH
2009-12-11  5:23   ` [11/34] ext4: make sure directory and symlink blocks are revoked Greg KH
2009-12-11  5:23   ` [12/34] ext4: fix i_flags access in ext4_da_writepages_trans_blocks() Greg KH
2009-12-11  5:23   ` [13/34] ext4: journal all modifications in ext4_xattr_set_handle Greg KH
2009-12-11  5:23   ` [14/34] ext4: dont update the superblock in ext4_statfs() Greg KH
2009-12-11  5:23   ` [15/34] ext4: fix uninit block bitmap initialization when s_meta_first_bg is non-zero Greg KH
2009-12-11  5:23   ` [16/34] ext4: fix block validity checks so they work correctly with meta_bg Greg KH
2009-12-11  5:23   ` [17/34] ext4: avoid issuing unnecessary barriers Greg KH
2009-12-11  5:23   ` [18/34] ext4: fix error handling in ext4_ind_get_blocks() Greg KH
2009-12-11  5:23   ` [19/34] ext4: make trim/discard optional (and off by default) Greg KH
2009-12-11  5:23   ` [20/34] ext4: make "norecovery" an alias for "noload" Greg KH
2009-12-11  5:23   ` [21/34] ext4: Fix double-free of blocks with EXT4_IOC_MOVE_EXT Greg KH
2009-12-11  5:23   ` [22/34] ext4: initialize moved_len before calling ext4_move_extents() Greg KH
2009-12-11  5:23   ` [23/34] ext4: move_extent_per_page() cleanup Greg KH
2009-12-11  5:23   ` [24/34] jbd2: Add ENOMEM checking in and for jbd2_journal_write_metadata_buffer() Greg KH
2009-12-11  5:23   ` [25/34] ext4: Return the PTR_ERR of the correct pointer in setup_new_group_blocks() Greg KH
2009-12-11  5:23   ` [26/34] ext4: Avoid data / filesystem corruption when write fails to copy data Greg KH
2009-12-11  5:23   ` [27/34] ext4: wait for log to commit when umounting Greg KH
2009-12-11  5:23   ` [28/34] ext4: remove blocks from inode prealloc list on failure Greg KH
2009-12-11  5:23   ` [29/34] ext4: ext4_get_reserved_space() must return bytes instead of blocks Greg KH
2009-12-11  5:23   ` [30/34] ext4: quota macros cleanup Greg KH
2009-12-11  5:23   ` [31/34] ext4: fix incorrect block reservation on quota transfer Greg KH
2009-12-11  5:23   ` Greg KH [this message]
2009-12-11  5:23   ` [33/34] ext4: Fix insufficient checks in EXT4_IOC_MOVE_EXT Greg KH
2009-12-11  5:23   ` [34/34] ext4: Fix potential fiemap deadlock (mmap_sem vs. i_data_sem) Greg KH

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20091211052557.723287400@linux.site \
    --to=gregkh@suse.de \
    --cc=akpm@linux-foundation.org \
    --cc=alan@lxorguk.ukuu.org.uk \
    --cc=jack@suse.cz \
    --cc=linux-kernel@vger.kernel.org \
    --cc=stable-review@kernel.org \
    --cc=stable@kernel.org \
    --cc=torvalds@linux-foundation.org \
    --cc=tytso@mit.edu \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

  Be sure your reply has a Subject: header at the top and a blank line
  before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.