From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
To: linux-kernel@vger.kernel.org
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>,
stable@vger.kernel.org,
Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>,
Al Viro <viro@zeniv.linux.org.uk>,
Andrew Morton <akpm@linux-foundation.org>,
Linus Torvalds <torvalds@linux-foundation.org>
Subject: [PATCH 3.10 08/17] nilfs2: fix deadlock of segment constructor over I_SYNC flag
Date: Mon, 9 Feb 2015 16:33:35 +0800 [thread overview]
Message-ID: <20150209083040.122014201@linuxfoundation.org> (raw)
In-Reply-To: <20150209083039.240170510@linuxfoundation.org>
3.10-stable review patch. If anyone has any objections, please let me know.
------------------
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
commit 7ef3ff2fea8bf5e4a21cef47ad87710a3d0fdb52 upstream.
Nilfs2 eventually hangs in a stress test with fsstress program. This
issue was caused by the following deadlock over I_SYNC flag between
nilfs_segctor_thread() and writeback_sb_inodes():
nilfs_segctor_thread()
nilfs_segctor_thread_construct()
nilfs_segctor_unlock()
nilfs_dispose_list()
iput()
iput_final()
evict()
inode_wait_for_writeback() * wait for I_SYNC flag
writeback_sb_inodes()
* set I_SYNC flag on inode->i_state
__writeback_single_inode()
do_writepages()
nilfs_writepages()
nilfs_construct_dsync_segment()
nilfs_segctor_sync()
* wait for completion of segment constructor
inode_sync_complete()
* clear I_SYNC flag after __writeback_single_inode() completed
writeback_sb_inodes() calls do_writepages() for dirty inodes after
setting I_SYNC flag on inode->i_state. do_writepages() in turn calls
nilfs_writepages(), which can run segment constructor and wait for its
completion. On the other hand, segment constructor calls iput(), which
can call evict() and wait for the I_SYNC flag on
inode_wait_for_writeback().
Since segment constructor doesn't know when I_SYNC will be set, it
cannot know whether iput() will block or not unless inode->i_nlink has a
non-zero count. We can prevent evict() from being called in iput() by
implementing sop->drop_inode(), but it's not preferable to leave inodes
with i_nlink == 0 for long periods because it even defers file
truncation and inode deallocation. So, this instead resolves the
deadlock by calling iput() asynchronously with a workqueue for inodes
with i_nlink == 0.
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Tested-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
fs/nilfs2/nilfs.h | 2 --
fs/nilfs2/segment.c | 44 +++++++++++++++++++++++++++++++++++++++-----
fs/nilfs2/segment.h | 5 +++++
3 files changed, 44 insertions(+), 7 deletions(-)
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -141,7 +141,6 @@ enum {
* @ti_save: Backup of journal_info field of task_struct
* @ti_flags: Flags
* @ti_count: Nest level
- * @ti_garbage: List of inode to be put when releasing semaphore
*/
struct nilfs_transaction_info {
u32 ti_magic;
@@ -150,7 +149,6 @@ struct nilfs_transaction_info {
one of other filesystems has a bug. */
unsigned short ti_flags;
unsigned short ti_count;
- struct list_head ti_garbage;
};
/* ti_magic */
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -305,7 +305,6 @@ static void nilfs_transaction_lock(struc
ti->ti_count = 0;
ti->ti_save = cur_ti;
ti->ti_magic = NILFS_TI_MAGIC;
- INIT_LIST_HEAD(&ti->ti_garbage);
current->journal_info = ti;
for (;;) {
@@ -332,8 +331,6 @@ static void nilfs_transaction_unlock(str
up_write(&nilfs->ns_segctor_sem);
current->journal_info = ti->ti_save;
- if (!list_empty(&ti->ti_garbage))
- nilfs_dispose_list(nilfs, &ti->ti_garbage, 0);
}
static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci,
@@ -746,6 +743,15 @@ static void nilfs_dispose_list(struct th
}
}
+static void nilfs_iput_work_func(struct work_struct *work)
+{
+ struct nilfs_sc_info *sci = container_of(work, struct nilfs_sc_info,
+ sc_iput_work);
+ struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
+
+ nilfs_dispose_list(nilfs, &sci->sc_iput_queue, 0);
+}
+
static int nilfs_test_metadata_dirty(struct the_nilfs *nilfs,
struct nilfs_root *root)
{
@@ -1899,8 +1905,8 @@ static int nilfs_segctor_collect_dirty_f
static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci,
struct the_nilfs *nilfs)
{
- struct nilfs_transaction_info *ti = current->journal_info;
struct nilfs_inode_info *ii, *n;
+ int defer_iput = false;
spin_lock(&nilfs->ns_inode_lock);
list_for_each_entry_safe(ii, n, &sci->sc_dirty_files, i_dirty) {
@@ -1911,9 +1917,24 @@ static void nilfs_segctor_drop_written_f
clear_bit(NILFS_I_BUSY, &ii->i_state);
brelse(ii->i_bh);
ii->i_bh = NULL;
- list_move_tail(&ii->i_dirty, &ti->ti_garbage);
+ list_del_init(&ii->i_dirty);
+ if (!ii->vfs_inode.i_nlink) {
+ /*
+ * Defer calling iput() to avoid a deadlock
+ * over I_SYNC flag for inodes with i_nlink == 0
+ */
+ list_add_tail(&ii->i_dirty, &sci->sc_iput_queue);
+ defer_iput = true;
+ } else {
+ spin_unlock(&nilfs->ns_inode_lock);
+ iput(&ii->vfs_inode);
+ spin_lock(&nilfs->ns_inode_lock);
+ }
}
spin_unlock(&nilfs->ns_inode_lock);
+
+ if (defer_iput)
+ schedule_work(&sci->sc_iput_work);
}
/*
@@ -2580,6 +2601,8 @@ static struct nilfs_sc_info *nilfs_segct
INIT_LIST_HEAD(&sci->sc_segbufs);
INIT_LIST_HEAD(&sci->sc_write_logs);
INIT_LIST_HEAD(&sci->sc_gc_inodes);
+ INIT_LIST_HEAD(&sci->sc_iput_queue);
+ INIT_WORK(&sci->sc_iput_work, nilfs_iput_work_func);
init_timer(&sci->sc_timer);
sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT;
@@ -2606,6 +2629,8 @@ static void nilfs_segctor_write_out(stru
ret = nilfs_segctor_construct(sci, SC_LSEG_SR);
nilfs_transaction_unlock(sci->sc_super);
+ flush_work(&sci->sc_iput_work);
+
} while (ret && retrycount-- > 0);
}
@@ -2630,6 +2655,9 @@ static void nilfs_segctor_destroy(struct
|| sci->sc_seq_request != sci->sc_seq_done);
spin_unlock(&sci->sc_state_lock);
+ if (flush_work(&sci->sc_iput_work))
+ flag = true;
+
if (flag || !nilfs_segctor_confirm(sci))
nilfs_segctor_write_out(sci);
@@ -2639,6 +2667,12 @@ static void nilfs_segctor_destroy(struct
nilfs_dispose_list(nilfs, &sci->sc_dirty_files, 1);
}
+ if (!list_empty(&sci->sc_iput_queue)) {
+ nilfs_warning(sci->sc_super, __func__,
+ "iput queue is not empty\n");
+ nilfs_dispose_list(nilfs, &sci->sc_iput_queue, 1);
+ }
+
WARN_ON(!list_empty(&sci->sc_segbufs));
WARN_ON(!list_empty(&sci->sc_write_logs));
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -26,6 +26,7 @@
#include <linux/types.h>
#include <linux/fs.h>
#include <linux/buffer_head.h>
+#include <linux/workqueue.h>
#include <linux/nilfs2_fs.h>
#include "nilfs.h"
@@ -92,6 +93,8 @@ struct nilfs_segsum_pointer {
* @sc_nblk_inc: Block count of current generation
* @sc_dirty_files: List of files to be written
* @sc_gc_inodes: List of GC inodes having blocks to be written
+ * @sc_iput_queue: list of inodes for which iput should be done
+ * @sc_iput_work: work struct to defer iput call
* @sc_freesegs: array of segment numbers to be freed
* @sc_nfreesegs: number of segments on @sc_freesegs
* @sc_dsync_inode: inode whose data pages are written for a sync operation
@@ -135,6 +138,8 @@ struct nilfs_sc_info {
struct list_head sc_dirty_files;
struct list_head sc_gc_inodes;
+ struct list_head sc_iput_queue;
+ struct work_struct sc_iput_work;
__u64 *sc_freesegs;
size_t sc_nfreesegs;
next prev parent reply other threads:[~2015-02-09 8:35 UTC|newest]
Thread overview: 24+ messages / expand[flat|nested] mbox.gz Atom feed top
2015-02-09 8:33 [PATCH 3.10 00/17] 3.10.69-stable review Greg Kroah-Hartman
2015-02-09 8:33 ` [PATCH 3.10 01/17] gpio: sysfs: fix memory leak in gpiod_export_link Greg Kroah-Hartman
2015-02-09 8:33 ` [PATCH 3.10 02/17] gpio: sysfs: fix memory leak in gpiod_sysfs_set_active_low Greg Kroah-Hartman
2015-02-09 8:33 ` [PATCH 3.10 03/17] PCI: Add NEC variants to Stratus ftServer PCIe DMI check Greg Kroah-Hartman
2015-02-09 8:33 ` [PATCH 3.10 04/17] MIPS: IRQ: Fix disable_irq on CPU IRQs Greg Kroah-Hartman
2015-02-09 8:33 ` [PATCH 3.10 05/17] MIPS: Fix kernel lockup or crash after CPU offline/online Greg Kroah-Hartman
2015-02-09 8:33 ` [PATCH 3.10 06/17] mm: pagewalk: call pte_hole() for VM_PFNMAP during walk_page_range Greg Kroah-Hartman
2015-02-09 8:33 ` [PATCH 3.10 07/17] lib/checksum.c: fix carry in csum_tcpudp_nofold Greg Kroah-Hartman
2015-02-09 8:33 ` Greg Kroah-Hartman [this message]
2015-02-09 8:33 ` [PATCH 3.10 09/17] arm64: Fix up /proc/cpuinfo Greg Kroah-Hartman
2015-02-09 8:33 ` Greg Kroah-Hartman
2015-02-09 8:33 ` Greg Kroah-Hartman
2015-02-09 8:33 ` [PATCH 3.10 10/17] ext4: prevent bugon on race between write/fcntl Greg Kroah-Hartman
2015-02-09 8:33 ` [PATCH 3.10 11/17] lib/checksum.c: fix build for generic csum_tcpudp_nofold Greg Kroah-Hartman
2015-02-09 8:33 ` [PATCH 3.10 12/17] ASoC: atmel_ssc_dai: fix start event for I2S mode Greg Kroah-Hartman
2015-02-09 8:33 ` [PATCH 3.10 13/17] ASoC: sgtl5000: add delay before first I2C access Greg Kroah-Hartman
2015-02-09 8:33 ` [PATCH 3.10 14/17] ALSA: ak411x: Fix stall in work callback Greg Kroah-Hartman
2015-02-09 8:33 ` [PATCH 3.10 15/17] smpboot: Add missing get_online_cpus() in smpboot_register_percpu_thread() Greg Kroah-Hartman
2015-02-09 8:33 ` [PATCH 3.10 16/17] kvm: vmx: handle invvpid vm exit gracefully Greg Kroah-Hartman
2015-02-09 8:33 ` [PATCH 3.10 17/17] x86,kvm,vmx: Preserve CR4 across VM entry Greg Kroah-Hartman
2015-02-09 16:37 ` [PATCH 3.10 00/17] 3.10.69-stable review Guenter Roeck
2015-02-09 21:38 ` Greg Kroah-Hartman
2015-02-09 18:21 ` Shuah Khan
2015-02-09 21:37 ` Greg Kroah-Hartman
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20150209083040.122014201@linuxfoundation.org \
--to=gregkh@linuxfoundation.org \
--cc=akpm@linux-foundation.org \
--cc=konishi.ryusuke@lab.ntt.co.jp \
--cc=linux-kernel@vger.kernel.org \
--cc=stable@vger.kernel.org \
--cc=torvalds@linux-foundation.org \
--cc=viro@zeniv.linux.org.uk \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.