From: Greg KH <gregkh@suse.de>
To: linux-kernel@vger.kernel.org, stable@kernel.org
Cc: stable-review@kernel.org, torvalds@linux-foundation.org,
akpm@linux-foundation.org, alan@lxorguk.ukuu.org.uk,
Jan Kara <jack@suse.cz>, "Theodore Tso" <tytso@mit.edu>,
Greg Kroah-Hartman <gregkh@suse.de>
Subject: [84/90] ext4: Wait for proper transaction commit on fsync
Date: Thu, 10 Dec 2009 20:26:02 -0800 [thread overview]
Message-ID: <20091211042819.790485160@linux.site> (raw)
In-Reply-To: <20091211043502.GA17916@kroah.com>
[-- Attachment #1: 0084-ext4-Wait-for-proper-transaction-commit-on-fsync.patch --]
[-- Type: text/plain, Size: 7949 bytes --]
2.6.31-stable review patch. If anyone has any objections, please let us know.
------------------
(cherry picked from commit b436b9bef84de6893e86346d8fbf7104bc520645)
We cannot rely on buffer dirty bits during fsync because pdflush can come
before fsync is called and clear dirty bits without forcing a transaction
commit. Instead, we track which transaction last changed the inode and
which transaction last changed its block allocation, and force the
relevant transaction to disk on fsync or fdatasync.
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
fs/ext4/ext4.h | 7 +++++++
fs/ext4/ext4_jbd2.h | 13 +++++++++++++
fs/ext4/extents.c | 14 ++++++++++++--
fs/ext4/fsync.c | 46 +++++++++++++++++-----------------------------
fs/ext4/inode.c | 29 +++++++++++++++++++++++++++++
fs/ext4/super.c | 2 ++
fs/jbd2/journal.c | 1 +
7 files changed, 81 insertions(+), 31 deletions(-)
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -700,6 +700,13 @@ struct ext4_inode_info {
struct list_head i_aio_dio_complete_list;
/* current io_end structure for async DIO write*/
ext4_io_end_t *cur_aio_dio;
+
+ /*
+ * Transactions that contain inode's metadata needed to complete
+ * fsync and fdatasync, respectively.
+ */
+ tid_t i_sync_tid;
+ tid_t i_datasync_tid;
};
/*
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -258,6 +258,19 @@ static inline int ext4_jbd2_file_inode(h
return 0;
}
+static inline void ext4_update_inode_fsync_trans(handle_t *handle,
+ struct inode *inode,
+ int datasync)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ if (ext4_handle_valid(handle)) {
+ ei->i_sync_tid = handle->h_transaction->t_tid;
+ if (datasync)
+ ei->i_datasync_tid = handle->h_transaction->t_tid;
+ }
+}
+
/* super.c */
int ext4_force_commit(struct super_block *sb);
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3041,6 +3041,8 @@ ext4_ext_handle_uninitialized_extents(ha
if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) {
ret = ext4_convert_unwritten_extents_dio(handle, inode,
path);
+ if (ret >= 0)
+ ext4_update_inode_fsync_trans(handle, inode, 1);
goto out2;
}
/* buffered IO case */
@@ -3068,6 +3070,8 @@ ext4_ext_handle_uninitialized_extents(ha
ret = ext4_ext_convert_to_initialized(handle, inode,
path, iblock,
max_blocks);
+ if (ret >= 0)
+ ext4_update_inode_fsync_trans(handle, inode, 1);
out:
if (ret <= 0) {
err = ret;
@@ -3306,10 +3310,16 @@ int ext4_ext_get_blocks(handle_t *handle
allocated = ext4_ext_get_actual_len(&newex);
set_buffer_new(bh_result);
- /* Cache only when it is _not_ an uninitialized extent */
- if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0)
+ /*
+ * Cache the extent and update transaction to commit on fdatasync only
+ * when it is _not_ an uninitialized extent.
+ */
+ if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
EXT4_EXT_CACHE_EXTENT);
+ ext4_update_inode_fsync_trans(handle, inode, 1);
+ } else
+ ext4_update_inode_fsync_trans(handle, inode, 0);
out:
if (allocated > max_blocks)
allocated = max_blocks;
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -51,25 +51,30 @@
int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
{
struct inode *inode = dentry->d_inode;
+ struct ext4_inode_info *ei = EXT4_I(inode);
journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
- int err, ret = 0;
+ int ret;
+ tid_t commit_tid;
J_ASSERT(ext4_journal_current_handle() == NULL);
trace_ext4_sync_file(file, dentry, datasync);
+ if (inode->i_sb->s_flags & MS_RDONLY)
+ return 0;
+
ret = flush_aio_dio_completed_IO(inode);
if (ret < 0)
return ret;
+
+ if (!journal)
+ return simple_fsync(file, dentry, datasync);
+
/*
- * data=writeback:
+ * data=writeback,ordered:
* The caller's filemap_fdatawrite()/wait will sync the data.
- * sync_inode() will sync the metadata
- *
- * data=ordered:
- * The caller's filemap_fdatawrite() will write the data and
- * sync_inode() will write the inode if it is dirty. Then the caller's
- * filemap_fdatawait() will wait on the pages.
+ * Metadata is in the journal, we wait for proper transaction to
+ * commit here.
*
* data=journal:
* filemap_fdatawrite won't do anything (the buffers are clean).
@@ -82,27 +87,10 @@ int ext4_sync_file(struct file *file, st
if (ext4_should_journal_data(inode))
return ext4_force_commit(inode->i_sb);
- if (!journal)
- ret = sync_mapping_buffers(inode->i_mapping);
-
- if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
- goto out;
-
- /*
- * The VFS has written the file data. If the inode is unaltered
- * then we need not start a commit.
- */
- if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) {
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_ALL,
- .nr_to_write = 0, /* sys_fsync did this */
- };
- err = sync_inode(inode, &wbc);
- if (ret == 0)
- ret = err;
- }
-out:
- if (journal && (journal->j_flags & JBD2_BARRIER))
+ commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
+ if (jbd2_log_start_commit(journal, commit_tid))
+ jbd2_log_wait_commit(journal, commit_tid);
+ else if (journal->j_flags & JBD2_BARRIER)
blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
return ret;
}
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1026,6 +1026,8 @@ static int ext4_ind_get_blocks(handle_t
goto cleanup;
set_buffer_new(bh_result);
+
+ ext4_update_inode_fsync_trans(handle, inode, 1);
got_it:
map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
if (count > blocks_to_boundary)
@@ -4784,6 +4786,7 @@ struct inode *ext4_iget(struct super_blo
struct ext4_inode *raw_inode;
struct ext4_inode_info *ei;
struct inode *inode;
+ journal_t *journal = EXT4_SB(sb)->s_journal;
long ret;
int block;
@@ -4848,6 +4851,31 @@ struct inode *ext4_iget(struct super_blo
ei->i_data[block] = raw_inode->i_block[block];
INIT_LIST_HEAD(&ei->i_orphan);
+ /*
+ * Set transaction id's of transactions that have to be committed
+ * to finish f[data]sync. We set them to currently running transaction
+ * as we cannot be sure that the inode or some of its metadata isn't
+ * part of the transaction - the inode could have been reclaimed and
+ * now it is reread from disk.
+ */
+ if (journal) {
+ transaction_t *transaction;
+ tid_t tid;
+
+ spin_lock(&journal->j_state_lock);
+ if (journal->j_running_transaction)
+ transaction = journal->j_running_transaction;
+ else
+ transaction = journal->j_committing_transaction;
+ if (transaction)
+ tid = transaction->t_tid;
+ else
+ tid = journal->j_commit_sequence;
+ spin_unlock(&journal->j_state_lock);
+ ei->i_sync_tid = tid;
+ ei->i_datasync_tid = tid;
+ }
+
if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
@@ -5102,6 +5130,7 @@ static int ext4_do_update_inode(handle_t
err = rc;
ei->i_state &= ~EXT4_STATE_NEW;
+ ext4_update_inode_fsync_trans(handle, inode, 0);
out_brelse:
brelse(bh);
ext4_std_error(inode->i_sb, err);
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -713,6 +713,8 @@ static struct inode *ext4_alloc_inode(st
spin_lock_init(&(ei->i_block_reservation_lock));
INIT_LIST_HEAD(&ei->i_aio_dio_complete_list);
ei->cur_aio_dio = NULL;
+ ei->i_sync_tid = 0;
+ ei->i_datasync_tid = 0;
return &ei->vfs_inode;
}
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -78,6 +78,7 @@ EXPORT_SYMBOL(jbd2_journal_errno);
EXPORT_SYMBOL(jbd2_journal_ack_err);
EXPORT_SYMBOL(jbd2_journal_clear_err);
EXPORT_SYMBOL(jbd2_log_wait_commit);
+EXPORT_SYMBOL(jbd2_log_start_commit);
EXPORT_SYMBOL(jbd2_journal_start_commit);
EXPORT_SYMBOL(jbd2_journal_force_commit_nested);
EXPORT_SYMBOL(jbd2_journal_wipe);
next prev parent reply other threads:[~2009-12-11 4:37 UTC|newest]
Thread overview: 91+ messages / expand[flat|nested] mbox.gz Atom feed top
[not found] <20091211042438.970725457@linux.site>
2009-12-11 4:35 ` [00/90] 2.6.31.8-stable review Greg KH
2009-12-11 4:24 ` [01/90] ext4: Fix memory leak fix when mounting an ext4 filesystem Greg KH
2009-12-11 4:24 ` [02/90] ext4: Avoid null pointer dereference when decoding EROFS w/o a journal Greg KH
2009-12-11 4:24 ` [03/90] jbd2: Fail to load a journal if it is too short Greg KH
2009-12-11 4:24 ` [04/90] jbd2: round commit timer up to avoid uncommitted transaction Greg KH
2009-12-11 4:24 ` [05/90] ext4: fix journal ref count in move_extent_par_page Greg KH
2009-12-11 4:24 ` [06/90] ext4: Fix bugs in mballocs stream allocation mode Greg KH
2009-12-11 4:24 ` [07/90] ext4: Avoid group preallocation for closed files Greg KH
2009-12-11 4:24 ` [08/90] jbd2: Annotate transaction start also for jbd2_journal_restart() Greg KH
2009-12-11 4:24 ` [09/90] ext4: Fix possible deadlock between ext4_truncate() and ext4_get_blocks() Greg KH
2009-12-11 4:24 ` [10/90] ext4: reject too-large filesystems on 32-bit kernels Greg KH
2009-12-11 4:24 ` [11/90] ext4: Add feature set check helper for mount & remount paths Greg KH
2009-12-11 4:24 ` [12/90] ext4: Add missing unlock_new_inode() call in extent migration code Greg KH
2009-12-11 4:24 ` [13/90] ext4: Allow rename to create more than EXT4_LINK_MAX subdirectories Greg KH
2009-12-11 4:24 ` [14/90] ext4: Limit number of links that can be created by ext4_link() Greg KH
2009-12-11 4:24 ` [15/90] ext4: Restore wbc->range_start in ext4_da_writepages() Greg KH
2009-12-11 4:24 ` [16/90] ext4: fix cache flush in ext4_sync_file Greg KH
2009-12-11 4:24 ` [17/90] ext4: Fix wrong comparisons in mext_check_arguments() Greg KH
2009-12-11 4:24 ` [18/90] ext4: Remove unneeded BUG_ON() in ext4_move_extents() Greg KH
2009-12-11 4:24 ` [19/90] ext4: Return exchanged blocks count to user space in failure Greg KH
2009-12-11 4:24 ` [20/90] ext4: Take page lock before looking at attached buffer_heads flags Greg KH
2009-12-11 4:24 ` [21/90] ext4: print more sysadmin-friendly message in check_block_validity() Greg KH
2009-12-11 4:25 ` [22/90] ext4: Use bforget() in no journal mode for ext4_journal_{forget,revoke}() Greg KH
2009-12-11 4:25 ` [23/90] ext4: Assure that metadata blocks are written during fsync in no journal mode Greg KH
2009-12-11 4:25 ` [24/90] ext4: Make non-journal fsync work properly Greg KH
2009-12-11 4:25 ` [25/90] ext4: move ext4_mb_init_group() function earlier in the mballoc.c Greg KH
2009-12-11 4:25 ` [26/90] ext4: check for need init flag in ext4_mb_load_buddy Greg KH
2009-12-11 4:25 ` [27/90] ext4: Dont update superblock write time when filesystem is read-only Greg KH
2009-12-11 4:25 ` [28/90] ext4: Always set dx_nodes fake_dirent explicitly Greg KH
2009-12-11 4:25 ` [29/90] ext4: Fix initalization of s_flex_groups Greg KH
2009-12-11 4:25 ` [30/90] ext4: Fix include/trace/events/ext4.h to work with Systemtap Greg KH
2009-12-11 4:25 ` [31/90] ext4: Fix small typo for move_extent_per_page() Greg KH
2009-12-11 4:25 ` [32/90] ext4: Replace get_ext_path macro with an inline funciton Greg KH
2009-12-11 4:25 ` [33/90] ext4: Replace BUG_ON() with ext4_error() in move_extents.c Greg KH
2009-12-11 4:25 ` [34/90] ext4: Add null extent check to ext_get_path Greg KH
2009-12-11 4:25 ` [35/90] ext4: Fix different block exchange issue in EXT4_IOC_MOVE_EXT Greg KH
2009-12-11 4:25 ` [36/90] ext4: limit block allocations for indirect-block files to < 2^32 Greg KH
2009-12-11 4:25 ` [37/90] ext4: store EXT4_EXT_MIGRATE in i_state instead of i_flags Greg KH
2009-12-11 4:25 ` [38/90] ext4: Fix the alloc on close after a truncate hueristic Greg KH
2009-12-11 4:25 ` [39/90] ext4: Fix hueristic which avoids group preallocation for closed files Greg KH
2009-12-11 4:25 ` [40/90] ext4: Adjust ext4_da_writepages() to write out larger contiguous chunks Greg KH
2009-12-11 4:25 ` [41/90] ext4: release reserved quota when block reservation for delalloc retry Greg KH
2009-12-11 4:25 ` [42/90] ext4: Split uninitialized extents for direct I/O Greg KH
2009-12-11 4:25 ` [43/90] ext4: Use end_io callback to avoid direct I/O fallback to buffered I/O Greg KH
2009-12-11 4:25 ` [44/90] ext4: async direct IO for holes and fallocate support Greg KH
2009-12-11 4:25 ` [45/90] ext4: EXT4_IOC_MOVE_EXT: Check for different original and donor inodes first Greg KH
2009-12-11 4:25 ` [46/90] ext4: Avoid updating the inode table bh twice in no journal mode Greg KH
2009-12-11 4:25 ` [47/90] ext4: Make sure ext4_dirty_inode() updates the inode " Greg KH
2009-12-11 4:25 ` [48/90] ext4: Handle nested ext4_journal_start/stop calls without a journal Greg KH
2009-12-11 4:25 ` [49/90] ext4: Fix time encoding with extra epoch bits Greg KH
2009-12-11 4:25 ` [50/90] ext4: fix a BUG_ON crash by checking that page has buffers attached to it Greg KH
2009-12-11 4:25 ` [51/90] ext4: retry failed direct IO allocations Greg KH
2009-12-11 4:25 ` [52/90] ext4: discard preallocation when restarting a transaction during truncate Greg KH
2009-12-11 4:25 ` [53/90] ext4: fix ext4_ext_direct_IO()s return value after converting uninit extents Greg KH
2009-12-11 4:25 ` [54/90] ext4: skip conversion of uninit extents after direct IO if there isnt any Greg KH
2009-12-11 4:25 ` [55/90] ext4: code clean up for dio fallocate handling Greg KH
2009-12-11 4:25 ` [56/90] ext4: Fix return value of ext4_split_unwritten_extents() to fix direct I/O Greg KH
2009-12-11 4:25 ` [57/90] ext4: fix potential buffer head leak when add_dirent_to_buf() returns ENOSPC Greg KH
2009-12-11 4:25 ` [58/90] ext4: avoid divide by zero when trying to mount a corrupted file system Greg KH
2009-12-11 4:25 ` [59/90] ext4: fix the returned block count if EXT4_IOC_MOVE_EXT fails Greg KH
2009-12-11 4:25 ` [60/90] ext4: fix lock order problem in ext4_move_extents() Greg KH
2009-12-11 4:25 ` [61/90] ext4: fix possible recursive locking warning in EXT4_IOC_MOVE_EXT Greg KH
2009-12-11 4:25 ` [62/90] ext4: plug a buffer_head leak in an error path of ext4_iget() Greg KH
2009-12-11 4:25 ` [63/90] ext4: make sure directory and symlink blocks are revoked Greg KH
2009-12-11 4:25 ` [64/90] ext4: fix i_flags access in ext4_da_writepages_trans_blocks() Greg KH
2009-12-11 4:25 ` [65/90] ext4: journal all modifications in ext4_xattr_set_handle Greg KH
2009-12-11 4:25 ` [66/90] ext4: dont update the superblock in ext4_statfs() Greg KH
2009-12-11 4:25 ` [67/90] ext4: fix uninit block bitmap initialization when s_meta_first_bg is non-zero Greg KH
2009-12-11 4:25 ` [68/90] ext4: fix block validity checks so they work correctly with meta_bg Greg KH
2009-12-11 4:25 ` [69/90] ext4: avoid issuing unnecessary barriers Greg KH
2009-12-11 4:25 ` [70/90] ext4: fix error handling in ext4_ind_get_blocks() Greg KH
2009-12-11 4:25 ` [71/90] ext4: make trim/discard optional (and off by default) Greg KH
2009-12-11 4:25 ` [72/90] ext4: make "norecovery" an alias for "noload" Greg KH
2009-12-11 4:25 ` [73/90] ext4: Fix double-free of blocks with EXT4_IOC_MOVE_EXT Greg KH
2009-12-11 4:25 ` [74/90] ext4: initialize moved_len before calling ext4_move_extents() Greg KH
2009-12-11 4:25 ` [75/90] ext4: move_extent_per_page() cleanup Greg KH
2009-12-11 4:25 ` [76/90] jbd2: Add ENOMEM checking in and for jbd2_journal_write_metadata_buffer() Greg KH
2009-12-11 4:25 ` [77/90] ext4: Return the PTR_ERR of the correct pointer in setup_new_group_blocks() Greg KH
2009-12-11 4:25 ` [78/90] ext4: Avoid data / filesystem corruption when write fails to copy data Greg KH
2009-12-11 4:25 ` [79/90] ext4: wait for log to commit when umounting Greg KH
2009-12-11 4:25 ` [80/90] ext4: remove blocks from inode prealloc list on failure Greg KH
2009-12-11 4:25 ` [81/90] ext4: ext4_get_reserved_space() must return bytes instead of blocks Greg KH
2009-12-11 4:26 ` [82/90] ext4: quota macros cleanup Greg KH
2009-12-11 4:26 ` [83/90] ext4: fix incorrect block reservation on quota transfer Greg KH
2009-12-11 4:26 ` Greg KH [this message]
2009-12-11 4:26 ` [85/90] ext4: Fix insufficient checks in EXT4_IOC_MOVE_EXT Greg KH
2009-12-11 4:26 ` [86/90] SCSI: megaraid_sas: fix 64 bit sense pointer truncation Greg KH
2009-12-11 4:26 ` [87/90] SCSI: osd_protocol.h: Add missing #include Greg KH
2009-12-11 4:26 ` [88/90] SCSI: scsi_lib_dma: fix bug with dma maps on nested scsi objects Greg KH
2009-12-11 4:26 ` [89/90] signal: Fix alternate signal stack check Greg KH
2009-12-11 4:26 ` [90/90] ext4: Fix potential fiemap deadlock (mmap_sem vs. i_data_sem) Greg KH
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20091211042819.790485160@linux.site \
--to=gregkh@suse.de \
--cc=akpm@linux-foundation.org \
--cc=alan@lxorguk.ukuu.org.uk \
--cc=jack@suse.cz \
--cc=linux-kernel@vger.kernel.org \
--cc=stable-review@kernel.org \
--cc=stable@kernel.org \
--cc=torvalds@linux-foundation.org \
--cc=tytso@mit.edu \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.