All of lore.kernel.org
 help / color / mirror / Atom feed
From: Greg KH <gregkh@suse.de>
To: linux-kernel@vger.kernel.org, stable@kernel.org
Cc: stable-review@kernel.org, torvalds@linux-foundation.org,
	akpm@linux-foundation.org, alan@lxorguk.ukuu.org.uk,
	Jan Kara <jack@suse.cz>, "Theodore Tso" <tytso@mit.edu>,
	Greg Kroah-Hartman <gregkh@suse.de>
Subject: [84/90] ext4: Wait for proper transaction commit on fsync
Date: Thu, 10 Dec 2009 20:26:02 -0800	[thread overview]
Message-ID: <20091211042819.790485160@linux.site> (raw)
In-Reply-To: <20091211043502.GA17916@kroah.com>

[-- Attachment #1: 0084-ext4-Wait-for-proper-transaction-commit-on-fsync.patch --]
[-- Type: text/plain, Size: 7949 bytes --]

2.6.31-stable review patch.  If anyone has any objections, please let us know.

------------------
(cherry picked from commit b436b9bef84de6893e86346d8fbf7104bc520645)

We cannot rely on buffer dirty bits during fsync because pdflush can run
before fsync is called and clear the dirty bits without forcing a transaction
commit. Instead, we track which transaction last changed the inode and which
transaction last changed its block allocation, and force the relevant
transaction to disk on fsync.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/ext4/ext4.h      |    7 +++++++
 fs/ext4/ext4_jbd2.h |   13 +++++++++++++
 fs/ext4/extents.c   |   14 ++++++++++++--
 fs/ext4/fsync.c     |   46 +++++++++++++++++-----------------------------
 fs/ext4/inode.c     |   29 +++++++++++++++++++++++++++++
 fs/ext4/super.c     |    2 ++
 fs/jbd2/journal.c   |    1 +
 7 files changed, 81 insertions(+), 31 deletions(-)

--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -700,6 +700,13 @@ struct ext4_inode_info {
 	struct list_head i_aio_dio_complete_list;
 	/* current io_end structure for async DIO write*/
 	ext4_io_end_t *cur_aio_dio;
+
+	/*
+	 * Transactions that contain inode's metadata needed to complete
+	 * fsync and fdatasync, respectively.
+	 */
+	tid_t i_sync_tid;
+	tid_t i_datasync_tid;
 };
 
 /*
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -258,6 +258,19 @@ static inline int ext4_jbd2_file_inode(h
 	return 0;
 }
 
+static inline void ext4_update_inode_fsync_trans(handle_t *handle,
+						 struct inode *inode,
+						 int datasync)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+
+	if (ext4_handle_valid(handle)) {
+		ei->i_sync_tid = handle->h_transaction->t_tid;
+		if (datasync)
+			ei->i_datasync_tid = handle->h_transaction->t_tid;
+	}
+}
+
 /* super.c */
 int ext4_force_commit(struct super_block *sb);
 
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3041,6 +3041,8 @@ ext4_ext_handle_uninitialized_extents(ha
 	if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) {
 		ret = ext4_convert_unwritten_extents_dio(handle, inode,
 							path);
+		if (ret >= 0)
+			ext4_update_inode_fsync_trans(handle, inode, 1);
 		goto out2;
 	}
 	/* buffered IO case */
@@ -3068,6 +3070,8 @@ ext4_ext_handle_uninitialized_extents(ha
 	ret = ext4_ext_convert_to_initialized(handle, inode,
 						path, iblock,
 						max_blocks);
+	if (ret >= 0)
+		ext4_update_inode_fsync_trans(handle, inode, 1);
 out:
 	if (ret <= 0) {
 		err = ret;
@@ -3306,10 +3310,16 @@ int ext4_ext_get_blocks(handle_t *handle
 	allocated = ext4_ext_get_actual_len(&newex);
 	set_buffer_new(bh_result);
 
-	/* Cache only when it is _not_ an uninitialized extent */
-	if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0)
+	/*
+	 * Cache the extent and update transaction to commit on fdatasync only
+	 * when it is _not_ an uninitialized extent.
+	 */
+	if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
 		ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
 						EXT4_EXT_CACHE_EXTENT);
+		ext4_update_inode_fsync_trans(handle, inode, 1);
+	} else
+		ext4_update_inode_fsync_trans(handle, inode, 0);
 out:
 	if (allocated > max_blocks)
 		allocated = max_blocks;
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -51,25 +51,30 @@
 int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
 {
 	struct inode *inode = dentry->d_inode;
+	struct ext4_inode_info *ei = EXT4_I(inode);
 	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
-	int err, ret = 0;
+	int ret;
+	tid_t commit_tid;
 
 	J_ASSERT(ext4_journal_current_handle() == NULL);
 
 	trace_ext4_sync_file(file, dentry, datasync);
 
+	if (inode->i_sb->s_flags & MS_RDONLY)
+		return 0;
+
 	ret = flush_aio_dio_completed_IO(inode);
 	if (ret < 0)
 		return ret;
+
+	if (!journal)
+		return simple_fsync(file, dentry, datasync);
+
 	/*
-	 * data=writeback:
+	 * data=writeback,ordered:
 	 *  The caller's filemap_fdatawrite()/wait will sync the data.
-	 *  sync_inode() will sync the metadata
-	 *
-	 * data=ordered:
-	 *  The caller's filemap_fdatawrite() will write the data and
-	 *  sync_inode() will write the inode if it is dirty.  Then the caller's
-	 *  filemap_fdatawait() will wait on the pages.
+	 *  Metadata is in the journal, we wait for proper transaction to
+	 *  commit here.
 	 *
 	 * data=journal:
 	 *  filemap_fdatawrite won't do anything (the buffers are clean).
@@ -82,27 +87,10 @@ int ext4_sync_file(struct file *file, st
 	if (ext4_should_journal_data(inode))
 		return ext4_force_commit(inode->i_sb);
 
-	if (!journal)
-		ret = sync_mapping_buffers(inode->i_mapping);
-
-	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
-		goto out;
-
-	/*
-	 * The VFS has written the file data.  If the inode is unaltered
-	 * then we need not start a commit.
-	 */
-	if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) {
-		struct writeback_control wbc = {
-			.sync_mode = WB_SYNC_ALL,
-			.nr_to_write = 0, /* sys_fsync did this */
-		};
-		err = sync_inode(inode, &wbc);
-		if (ret == 0)
-			ret = err;
-	}
-out:
-	if (journal && (journal->j_flags & JBD2_BARRIER))
+	commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
+	if (jbd2_log_start_commit(journal, commit_tid))
+		jbd2_log_wait_commit(journal, commit_tid);
+	else if (journal->j_flags & JBD2_BARRIER)
 		blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
 	return ret;
 }
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1026,6 +1026,8 @@ static int ext4_ind_get_blocks(handle_t
 		goto cleanup;
 
 	set_buffer_new(bh_result);
+
+	ext4_update_inode_fsync_trans(handle, inode, 1);
 got_it:
 	map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
 	if (count > blocks_to_boundary)
@@ -4784,6 +4786,7 @@ struct inode *ext4_iget(struct super_blo
 	struct ext4_inode *raw_inode;
 	struct ext4_inode_info *ei;
 	struct inode *inode;
+	journal_t *journal = EXT4_SB(sb)->s_journal;
 	long ret;
 	int block;
 
@@ -4848,6 +4851,31 @@ struct inode *ext4_iget(struct super_blo
 		ei->i_data[block] = raw_inode->i_block[block];
 	INIT_LIST_HEAD(&ei->i_orphan);
 
+	/*
+	 * Set transaction id's of transactions that have to be committed
+	 * to finish f[data]sync. We set them to currently running transaction
+	 * as we cannot be sure that the inode or some of its metadata isn't
+	 * part of the transaction - the inode could have been reclaimed and
+	 * now it is reread from disk.
+	 */
+	if (journal) {
+		transaction_t *transaction;
+		tid_t tid;
+
+		spin_lock(&journal->j_state_lock);
+		if (journal->j_running_transaction)
+			transaction = journal->j_running_transaction;
+		else
+			transaction = journal->j_committing_transaction;
+		if (transaction)
+			tid = transaction->t_tid;
+		else
+			tid = journal->j_commit_sequence;
+		spin_unlock(&journal->j_state_lock);
+		ei->i_sync_tid = tid;
+		ei->i_datasync_tid = tid;
+	}
+
 	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
 		ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
 		if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
@@ -5102,6 +5130,7 @@ static int ext4_do_update_inode(handle_t
 		err = rc;
 	ei->i_state &= ~EXT4_STATE_NEW;
 
+	ext4_update_inode_fsync_trans(handle, inode, 0);
 out_brelse:
 	brelse(bh);
 	ext4_std_error(inode->i_sb, err);
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -713,6 +713,8 @@ static struct inode *ext4_alloc_inode(st
 	spin_lock_init(&(ei->i_block_reservation_lock));
 	INIT_LIST_HEAD(&ei->i_aio_dio_complete_list);
 	ei->cur_aio_dio = NULL;
+	ei->i_sync_tid = 0;
+	ei->i_datasync_tid = 0;
 
 	return &ei->vfs_inode;
 }
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -78,6 +78,7 @@ EXPORT_SYMBOL(jbd2_journal_errno);
 EXPORT_SYMBOL(jbd2_journal_ack_err);
 EXPORT_SYMBOL(jbd2_journal_clear_err);
 EXPORT_SYMBOL(jbd2_log_wait_commit);
+EXPORT_SYMBOL(jbd2_log_start_commit);
 EXPORT_SYMBOL(jbd2_journal_start_commit);
 EXPORT_SYMBOL(jbd2_journal_force_commit_nested);
 EXPORT_SYMBOL(jbd2_journal_wipe);



  parent reply	other threads:[~2009-12-11  4:37 UTC|newest]

Thread overview: 91+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <20091211042438.970725457@linux.site>
2009-12-11  4:35 ` [00/90] 2.6.31.8-stable review Greg KH
2009-12-11  4:24   ` [01/90] ext4: Fix memory leak fix when mounting an ext4 filesystem Greg KH
2009-12-11  4:24   ` [02/90] ext4: Avoid null pointer dereference when decoding EROFS w/o a journal Greg KH
2009-12-11  4:24   ` [03/90] jbd2: Fail to load a journal if it is too short Greg KH
2009-12-11  4:24   ` [04/90] jbd2: round commit timer up to avoid uncommitted transaction Greg KH
2009-12-11  4:24   ` [05/90] ext4: fix journal ref count in move_extent_par_page Greg KH
2009-12-11  4:24   ` [06/90] ext4: Fix bugs in mballocs stream allocation mode Greg KH
2009-12-11  4:24   ` [07/90] ext4: Avoid group preallocation for closed files Greg KH
2009-12-11  4:24   ` [08/90] jbd2: Annotate transaction start also for jbd2_journal_restart() Greg KH
2009-12-11  4:24   ` [09/90] ext4: Fix possible deadlock between ext4_truncate() and ext4_get_blocks() Greg KH
2009-12-11  4:24   ` [10/90] ext4: reject too-large filesystems on 32-bit kernels Greg KH
2009-12-11  4:24   ` [11/90] ext4: Add feature set check helper for mount & remount paths Greg KH
2009-12-11  4:24   ` [12/90] ext4: Add missing unlock_new_inode() call in extent migration code Greg KH
2009-12-11  4:24   ` [13/90] ext4: Allow rename to create more than EXT4_LINK_MAX subdirectories Greg KH
2009-12-11  4:24   ` [14/90] ext4: Limit number of links that can be created by ext4_link() Greg KH
2009-12-11  4:24   ` [15/90] ext4: Restore wbc->range_start in ext4_da_writepages() Greg KH
2009-12-11  4:24   ` [16/90] ext4: fix cache flush in ext4_sync_file Greg KH
2009-12-11  4:24   ` [17/90] ext4: Fix wrong comparisons in mext_check_arguments() Greg KH
2009-12-11  4:24   ` [18/90] ext4: Remove unneeded BUG_ON() in ext4_move_extents() Greg KH
2009-12-11  4:24   ` [19/90] ext4: Return exchanged blocks count to user space in failure Greg KH
2009-12-11  4:24   ` [20/90] ext4: Take page lock before looking at attached buffer_heads flags Greg KH
2009-12-11  4:24   ` [21/90] ext4: print more sysadmin-friendly message in check_block_validity() Greg KH
2009-12-11  4:25   ` [22/90] ext4: Use bforget() in no journal mode for ext4_journal_{forget,revoke}() Greg KH
2009-12-11  4:25   ` [23/90] ext4: Assure that metadata blocks are written during fsync in no journal mode Greg KH
2009-12-11  4:25   ` [24/90] ext4: Make non-journal fsync work properly Greg KH
2009-12-11  4:25   ` [25/90] ext4: move ext4_mb_init_group() function earlier in the mballoc.c Greg KH
2009-12-11  4:25   ` [26/90] ext4: check for need init flag in ext4_mb_load_buddy Greg KH
2009-12-11  4:25   ` [27/90] ext4: Dont update superblock write time when filesystem is read-only Greg KH
2009-12-11  4:25   ` [28/90] ext4: Always set dx_nodes fake_dirent explicitly Greg KH
2009-12-11  4:25   ` [29/90] ext4: Fix initalization of s_flex_groups Greg KH
2009-12-11  4:25   ` [30/90] ext4: Fix include/trace/events/ext4.h to work with Systemtap Greg KH
2009-12-11  4:25   ` [31/90] ext4: Fix small typo for move_extent_per_page() Greg KH
2009-12-11  4:25   ` [32/90] ext4: Replace get_ext_path macro with an inline funciton Greg KH
2009-12-11  4:25   ` [33/90] ext4: Replace BUG_ON() with ext4_error() in move_extents.c Greg KH
2009-12-11  4:25   ` [34/90] ext4: Add null extent check to ext_get_path Greg KH
2009-12-11  4:25   ` [35/90] ext4: Fix different block exchange issue in EXT4_IOC_MOVE_EXT Greg KH
2009-12-11  4:25   ` [36/90] ext4: limit block allocations for indirect-block files to < 2^32 Greg KH
2009-12-11  4:25   ` [37/90] ext4: store EXT4_EXT_MIGRATE in i_state instead of i_flags Greg KH
2009-12-11  4:25   ` [38/90] ext4: Fix the alloc on close after a truncate hueristic Greg KH
2009-12-11  4:25   ` [39/90] ext4: Fix hueristic which avoids group preallocation for closed files Greg KH
2009-12-11  4:25   ` [40/90] ext4: Adjust ext4_da_writepages() to write out larger contiguous chunks Greg KH
2009-12-11  4:25   ` [41/90] ext4: release reserved quota when block reservation for delalloc retry Greg KH
2009-12-11  4:25   ` [42/90] ext4: Split uninitialized extents for direct I/O Greg KH
2009-12-11  4:25   ` [43/90] ext4: Use end_io callback to avoid direct I/O fallback to buffered I/O Greg KH
2009-12-11  4:25   ` [44/90] ext4: async direct IO for holes and fallocate support Greg KH
2009-12-11  4:25   ` [45/90] ext4: EXT4_IOC_MOVE_EXT: Check for different original and donor inodes first Greg KH
2009-12-11  4:25   ` [46/90] ext4: Avoid updating the inode table bh twice in no journal mode Greg KH
2009-12-11  4:25   ` [47/90] ext4: Make sure ext4_dirty_inode() updates the inode " Greg KH
2009-12-11  4:25   ` [48/90] ext4: Handle nested ext4_journal_start/stop calls without a journal Greg KH
2009-12-11  4:25   ` [49/90] ext4: Fix time encoding with extra epoch bits Greg KH
2009-12-11  4:25   ` [50/90] ext4: fix a BUG_ON crash by checking that page has buffers attached to it Greg KH
2009-12-11  4:25   ` [51/90] ext4: retry failed direct IO allocations Greg KH
2009-12-11  4:25   ` [52/90] ext4: discard preallocation when restarting a transaction during truncate Greg KH
2009-12-11  4:25   ` [53/90] ext4: fix ext4_ext_direct_IO()s return value after converting uninit extents Greg KH
2009-12-11  4:25   ` [54/90] ext4: skip conversion of uninit extents after direct IO if there isnt any Greg KH
2009-12-11  4:25   ` [55/90] ext4: code clean up for dio fallocate handling Greg KH
2009-12-11  4:25   ` [56/90] ext4: Fix return value of ext4_split_unwritten_extents() to fix direct I/O Greg KH
2009-12-11  4:25   ` [57/90] ext4: fix potential buffer head leak when add_dirent_to_buf() returns ENOSPC Greg KH
2009-12-11  4:25   ` [58/90] ext4: avoid divide by zero when trying to mount a corrupted file system Greg KH
2009-12-11  4:25   ` [59/90] ext4: fix the returned block count if EXT4_IOC_MOVE_EXT fails Greg KH
2009-12-11  4:25   ` [60/90] ext4: fix lock order problem in ext4_move_extents() Greg KH
2009-12-11  4:25   ` [61/90] ext4: fix possible recursive locking warning in EXT4_IOC_MOVE_EXT Greg KH
2009-12-11  4:25   ` [62/90] ext4: plug a buffer_head leak in an error path of ext4_iget() Greg KH
2009-12-11  4:25   ` [63/90] ext4: make sure directory and symlink blocks are revoked Greg KH
2009-12-11  4:25   ` [64/90] ext4: fix i_flags access in ext4_da_writepages_trans_blocks() Greg KH
2009-12-11  4:25   ` [65/90] ext4: journal all modifications in ext4_xattr_set_handle Greg KH
2009-12-11  4:25   ` [66/90] ext4: dont update the superblock in ext4_statfs() Greg KH
2009-12-11  4:25   ` [67/90] ext4: fix uninit block bitmap initialization when s_meta_first_bg is non-zero Greg KH
2009-12-11  4:25   ` [68/90] ext4: fix block validity checks so they work correctly with meta_bg Greg KH
2009-12-11  4:25   ` [69/90] ext4: avoid issuing unnecessary barriers Greg KH
2009-12-11  4:25   ` [70/90] ext4: fix error handling in ext4_ind_get_blocks() Greg KH
2009-12-11  4:25   ` [71/90] ext4: make trim/discard optional (and off by default) Greg KH
2009-12-11  4:25   ` [72/90] ext4: make "norecovery" an alias for "noload" Greg KH
2009-12-11  4:25   ` [73/90] ext4: Fix double-free of blocks with EXT4_IOC_MOVE_EXT Greg KH
2009-12-11  4:25   ` [74/90] ext4: initialize moved_len before calling ext4_move_extents() Greg KH
2009-12-11  4:25   ` [75/90] ext4: move_extent_per_page() cleanup Greg KH
2009-12-11  4:25   ` [76/90] jbd2: Add ENOMEM checking in and for jbd2_journal_write_metadata_buffer() Greg KH
2009-12-11  4:25   ` [77/90] ext4: Return the PTR_ERR of the correct pointer in setup_new_group_blocks() Greg KH
2009-12-11  4:25   ` [78/90] ext4: Avoid data / filesystem corruption when write fails to copy data Greg KH
2009-12-11  4:25   ` [79/90] ext4: wait for log to commit when umounting Greg KH
2009-12-11  4:25   ` [80/90] ext4: remove blocks from inode prealloc list on failure Greg KH
2009-12-11  4:25   ` [81/90] ext4: ext4_get_reserved_space() must return bytes instead of blocks Greg KH
2009-12-11  4:26   ` [82/90] ext4: quota macros cleanup Greg KH
2009-12-11  4:26   ` [83/90] ext4: fix incorrect block reservation on quota transfer Greg KH
2009-12-11  4:26   ` Greg KH [this message]
2009-12-11  4:26   ` [85/90] ext4: Fix insufficient checks in EXT4_IOC_MOVE_EXT Greg KH
2009-12-11  4:26   ` [86/90] SCSI: megaraid_sas: fix 64 bit sense pointer truncation Greg KH
2009-12-11  4:26   ` [87/90] SCSI: osd_protocol.h: Add missing #include Greg KH
2009-12-11  4:26   ` [88/90] SCSI: scsi_lib_dma: fix bug with dma maps on nested scsi objects Greg KH
2009-12-11  4:26   ` [89/90] signal: Fix alternate signal stack check Greg KH
2009-12-11  4:26   ` [90/90] ext4: Fix potential fiemap deadlock (mmap_sem vs. i_data_sem) Greg KH

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20091211042819.790485160@linux.site \
    --to=gregkh@suse.de \
    --cc=akpm@linux-foundation.org \
    --cc=alan@lxorguk.ukuu.org.uk \
    --cc=jack@suse.cz \
    --cc=linux-kernel@vger.kernel.org \
    --cc=stable-review@kernel.org \
    --cc=stable@kernel.org \
    --cc=torvalds@linux-foundation.org \
    --cc=tytso@mit.edu \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes;
see the mirroring instructions for how to clone and mirror
all data and code used by this external index.