From: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
To: linux-ext4@vger.kernel.org
Cc: tytso@mit.edu, jack@suse.cz, harshads@google.com,
Harshad Shirwadkar <harshadshirwadkar@gmail.com>
Subject: [PATCH v7 9/9] ext4: hold s_fc_lock while during fast commit
Date: Sun, 18 Aug 2024 04:03:56 +0000 [thread overview]
Message-ID: <20240818040356.241684-11-harshadshirwadkar@gmail.com> (raw)
In-Reply-To: <20240818040356.241684-1-harshadshirwadkar@gmail.com>
Leaving s_fc_lock in between during commit in ext4_fc_perform_commit()
function leaves room for subtle concurrency bugs where ext4_fc_del() may
delete an inode from the fast commit list, leaving list in an inconsistent
state. Also, this patch converts s_fc_lock to mutex type so that it can be
held when kmem_cache_* functions are called.
Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
---
fs/ext4/ext4.h | 2 +-
fs/ext4/fast_commit.c | 91 +++++++++++++++++--------------------------
fs/ext4/super.c | 2 +-
3 files changed, 38 insertions(+), 57 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 4ecb63f95..a1acd34ff 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1748,7 +1748,7 @@ struct ext4_sb_info {
* following fields:
* ei->i_fc_list, s_fc_dentry_q, s_fc_q, s_fc_bytes, s_fc_bh.
*/
- spinlock_t s_fc_lock;
+ struct mutex s_fc_lock;
struct buffer_head *s_fc_bh;
struct ext4_fc_stats s_fc_stats;
tid_t s_fc_ineligible_tid;
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 7525450f1..c3627efd9 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -236,9 +236,9 @@ void ext4_fc_del(struct inode *inode)
if (ext4_fc_disabled(inode->i_sb))
return;
- spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
+ mutex_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
- spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
+ mutex_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
return;
}
@@ -266,7 +266,7 @@ void ext4_fc_del(struct inode *inode)
* dentry create references, since it is not needed to log it anyways.
*/
if (list_empty(&ei->i_fc_dilist)) {
- spin_unlock(&sbi->s_fc_lock);
+ mutex_unlock(&sbi->s_fc_lock);
return;
}
@@ -276,7 +276,7 @@ void ext4_fc_del(struct inode *inode)
list_del_init(&fc_dentry->fcd_dilist);
WARN_ON(!list_empty(&ei->i_fc_dilist));
- spin_unlock(&sbi->s_fc_lock);
+ mutex_unlock(&sbi->s_fc_lock);
if (fc_dentry->fcd_name.name &&
fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
@@ -306,10 +306,10 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handl
sbi->s_journal->j_running_transaction->t_tid : 0;
read_unlock(&sbi->s_journal->j_state_lock);
}
- spin_lock(&sbi->s_fc_lock);
+ mutex_lock(&sbi->s_fc_lock);
if (tid_gt(tid, sbi->s_fc_ineligible_tid))
sbi->s_fc_ineligible_tid = tid;
- spin_unlock(&sbi->s_fc_lock);
+ mutex_unlock(&sbi->s_fc_lock);
WARN_ON(reason >= EXT4_FC_REASON_MAX);
sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
}
@@ -349,14 +349,14 @@ static int ext4_fc_track_template(
if (!enqueue)
return ret;
- spin_lock(&sbi->s_fc_lock);
+ mutex_lock(&sbi->s_fc_lock);
if (list_empty(&EXT4_I(inode)->i_fc_list))
list_add_tail(&EXT4_I(inode)->i_fc_list,
(sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ?
&sbi->s_fc_q[FC_Q_STAGING] :
&sbi->s_fc_q[FC_Q_MAIN]);
- spin_unlock(&sbi->s_fc_lock);
+ mutex_unlock(&sbi->s_fc_lock);
return ret;
}
@@ -414,7 +414,8 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update)
}
node->fcd_name.len = dentry->d_name.len;
INIT_LIST_HEAD(&node->fcd_dilist);
- spin_lock(&sbi->s_fc_lock);
+ INIT_LIST_HEAD(&node->fcd_list);
+ mutex_lock(&sbi->s_fc_lock);
if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING)
list_add_tail(&node->fcd_list,
@@ -435,7 +436,7 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update)
WARN_ON(!list_empty(&ei->i_fc_dilist));
list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist);
}
- spin_unlock(&sbi->s_fc_lock);
+ mutex_unlock(&sbi->s_fc_lock);
spin_lock(&ei->i_fc_lock);
return 0;
@@ -955,15 +956,15 @@ static int ext4_fc_submit_inode_data_all(journal_t *journal)
struct ext4_inode_info *ei;
int ret = 0;
- spin_lock(&sbi->s_fc_lock);
+ mutex_lock(&sbi->s_fc_lock);
list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
- spin_unlock(&sbi->s_fc_lock);
+ mutex_unlock(&sbi->s_fc_lock);
ret = jbd2_submit_inode_data(journal, ei->jinode);
if (ret)
return ret;
- spin_lock(&sbi->s_fc_lock);
+ mutex_lock(&sbi->s_fc_lock);
}
- spin_unlock(&sbi->s_fc_lock);
+ mutex_unlock(&sbi->s_fc_lock);
return ret;
}
@@ -976,19 +977,19 @@ static int ext4_fc_wait_inode_data_all(journal_t *journal)
struct ext4_inode_info *pos, *n;
int ret = 0;
- spin_lock(&sbi->s_fc_lock);
+ mutex_lock(&sbi->s_fc_lock);
list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
if (!ext4_test_inode_state(&pos->vfs_inode,
EXT4_STATE_FC_COMMITTING))
continue;
- spin_unlock(&sbi->s_fc_lock);
+ mutex_unlock(&sbi->s_fc_lock);
ret = jbd2_wait_inode_data(journal, pos->jinode);
if (ret)
return ret;
- spin_lock(&sbi->s_fc_lock);
+ mutex_lock(&sbi->s_fc_lock);
}
- spin_unlock(&sbi->s_fc_lock);
+ mutex_unlock(&sbi->s_fc_lock);
return 0;
}
@@ -1010,26 +1011,22 @@ __releases(&sbi->s_fc_lock)
list_for_each_entry_safe(fc_dentry, fc_dentry_n,
&sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
- spin_unlock(&sbi->s_fc_lock);
- if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
- ret = -ENOSPC;
- goto lock_and_exit;
- }
- spin_lock(&sbi->s_fc_lock);
+ if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry))
+ return -ENOSPC;
continue;
}
/*
* With fcd_dilist we need not loop in sbi->s_fc_q to get the
- * corresponding inode pointer
+ * corresponding inode. Also, the corresponding inode could have been
+ * deleted, in which case, we don't need to do anything.
*/
- WARN_ON(list_empty(&fc_dentry->fcd_dilist));
+ if (list_empty(&fc_dentry->fcd_dilist))
+ continue;
ei = list_first_entry(&fc_dentry->fcd_dilist,
struct ext4_inode_info, i_fc_dilist);
inode = &ei->vfs_inode;
WARN_ON(inode->i_ino != fc_dentry->fcd_ino);
- spin_unlock(&sbi->s_fc_lock);
-
/*
* We first write the inode and then the create dirent. This
* allows the recovery code to create an unnamed inode first
@@ -1039,23 +1036,14 @@ __releases(&sbi->s_fc_lock)
*/
ret = ext4_fc_write_inode(inode, crc);
if (ret)
- goto lock_and_exit;
-
+ return ret;
ret = ext4_fc_write_inode_data(inode, crc);
if (ret)
- goto lock_and_exit;
-
- if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
- ret = -ENOSPC;
- goto lock_and_exit;
- }
-
- spin_lock(&sbi->s_fc_lock);
+ return ret;
+ if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry))
+ return -ENOSPC;
}
return 0;
-lock_and_exit:
- spin_lock(&sbi->s_fc_lock);
- return ret;
}
static int ext4_fc_perform_commit(journal_t *journal)
@@ -1074,12 +1062,12 @@ static int ext4_fc_perform_commit(journal_t *journal)
* and then lock the journal.
*/
jbd2_journal_lock_updates(journal);
- spin_lock(&sbi->s_fc_lock);
+ mutex_lock(&sbi->s_fc_lock);
list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
ext4_set_inode_state(&iter->vfs_inode,
EXT4_STATE_FC_COMMITTING);
}
- spin_unlock(&sbi->s_fc_lock);
+ mutex_unlock(&sbi->s_fc_lock);
jbd2_journal_unlock_updates(journal);
ret = ext4_fc_submit_inode_data_all(journal);
@@ -1113,19 +1101,16 @@ static int ext4_fc_perform_commit(journal_t *journal)
}
}
- spin_lock(&sbi->s_fc_lock);
+ mutex_lock(&sbi->s_fc_lock);
ret = ext4_fc_commit_dentry_updates(journal, &crc);
- if (ret) {
- spin_unlock(&sbi->s_fc_lock);
+ if (ret)
goto out;
- }
list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
inode = &iter->vfs_inode;
if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
continue;
- spin_unlock(&sbi->s_fc_lock);
ret = ext4_fc_write_inode_data(inode, &crc);
if (ret)
goto out;
@@ -1144,13 +1129,11 @@ static int ext4_fc_perform_commit(journal_t *journal)
#else
wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
#endif
- spin_lock(&sbi->s_fc_lock);
}
- spin_unlock(&sbi->s_fc_lock);
-
ret = ext4_fc_write_tail(sb, crc);
out:
+ mutex_unlock(&sbi->s_fc_lock);
blk_finish_plug(&plug);
return ret;
}
@@ -1291,7 +1274,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
trace_ext4_fc_cleanup(journal, full, tid);
jbd2_fc_release_bufs(journal);
- spin_lock(&sbi->s_fc_lock);
+ mutex_lock(&sbi->s_fc_lock);
list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
i_fc_list) {
ext4_clear_inode_state(&iter->vfs_inode,
@@ -1318,13 +1301,11 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
fcd_list);
list_del_init(&fc_dentry->fcd_list);
list_del_init(&fc_dentry->fcd_dilist);
- spin_unlock(&sbi->s_fc_lock);
if (fc_dentry->fcd_name.name &&
fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
kfree(fc_dentry->fcd_name.name);
kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
- spin_lock(&sbi->s_fc_lock);
}
list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
@@ -1339,7 +1320,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
if (full)
sbi->s_fc_bytes = 0;
- spin_unlock(&sbi->s_fc_lock);
+ mutex_unlock(&sbi->s_fc_lock);
trace_ext4_fc_stats(sb);
}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 4f38a34b0..bef9fd128 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -4436,7 +4436,7 @@ static void ext4_fast_commit_init(struct super_block *sb)
sbi->s_fc_bytes = 0;
ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
sbi->s_fc_ineligible_tid = 0;
- spin_lock_init(&sbi->s_fc_lock);
+ mutex_init(&sbi->s_fc_lock);
memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats));
sbi->s_fc_replay_state.fc_regions = NULL;
sbi->s_fc_replay_state.fc_regions_size = 0;
--
2.46.0.184.g6999bdac58-goog
next prev parent reply other threads:[~2024-08-18 4:04 UTC|newest]
Thread overview: 22+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-08-18 4:03 [PATCH] ext4: convert i_fc_lock to spinlock Harshad Shirwadkar
2024-08-18 4:03 ` [PATCH v7 0/9] Ext4 Fast Commit Performance Patchset Harshad Shirwadkar
2024-08-18 4:49 ` harshad shirwadkar
2024-08-18 4:03 ` [PATCH v7 1/9] ext4: convert i_fc_lock to spinlock Harshad Shirwadkar
2024-08-18 4:03 ` [PATCH v7 2/9] ext4: for committing inode, make ext4_fc_track_inode wait Harshad Shirwadkar
2024-12-12 22:00 ` Jan Kara
2024-12-13 15:10 ` Jan Kara
2025-04-14 16:57 ` harshad shirwadkar
2024-08-18 4:03 ` [PATCH v7 3/9] ext4: mark inode dirty before grabbing i_data_sem in ext4_setattr Harshad Shirwadkar
2024-12-12 21:57 ` Jan Kara
2024-08-18 4:03 ` [PATCH v7 4/9] ext4: rework fast commit commit path Harshad Shirwadkar
2024-12-13 15:12 ` Jan Kara
2024-08-18 4:03 ` [PATCH v7 5/9] ext4: drop i_fc_updates from inode fc info Harshad Shirwadkar
2024-08-18 4:03 ` [PATCH v7 6/9] ext4: update code documentation Harshad Shirwadkar
2024-12-13 15:15 ` Jan Kara
2024-08-18 4:03 ` [PATCH v7 7/9] ext4: temporarily elevate commit thread priority Harshad Shirwadkar
2024-08-18 4:03 ` [PATCH v7 8/9] ext4: make fast commit ineligible on ext4_reserve_inode_write failure Harshad Shirwadkar
2024-12-16 10:40 ` Jan Kara
2024-08-18 4:03 ` Harshad Shirwadkar [this message]
2024-12-16 10:50 ` [PATCH v7 9/9] ext4: hold s_fc_lock while during fast commit Jan Kara
2025-01-13 14:16 ` Baokun Li
2025-04-14 16:59 ` harshad shirwadkar
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20240818040356.241684-11-harshadshirwadkar@gmail.com \
--to=harshadshirwadkar@gmail.com \
--cc=harshads@google.com \
--cc=jack@suse.cz \
--cc=linux-ext4@vger.kernel.org \
--cc=tytso@mit.edu \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox