From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
To: linux-kernel@vger.kernel.org
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>,
stable@vger.kernel.org,
Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>,
Al Viro <viro@zeniv.linux.org.uk>,
Andrew Morton <akpm@linux-foundation.org>,
Linus Torvalds <torvalds@linux-foundation.org>
Subject: [PATCH 3.18 22/39] nilfs2: fix deadlock of segment constructor over I_SYNC flag
Date: Mon, 9 Feb 2015 16:34:05 +0800 [thread overview]
Message-ID: <20150209083329.832240939@linuxfoundation.org> (raw)
In-Reply-To: <20150209083328.753647350@linuxfoundation.org>
3.18-stable review patch. If anyone has any objections, please let me know.
------------------
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
commit 7ef3ff2fea8bf5e4a21cef47ad87710a3d0fdb52 upstream.
Nilfs2 eventually hangs in a stress test with fsstress program. This
issue was caused by the following deadlock over I_SYNC flag between
nilfs_segctor_thread() and writeback_sb_inodes():
nilfs_segctor_thread()
nilfs_segctor_thread_construct()
nilfs_segctor_unlock()
nilfs_dispose_list()
iput()
iput_final()
evict()
inode_wait_for_writeback() * wait for I_SYNC flag
writeback_sb_inodes()
* set I_SYNC flag on inode->i_state
__writeback_single_inode()
do_writepages()
nilfs_writepages()
nilfs_construct_dsync_segment()
nilfs_segctor_sync()
* wait for completion of segment constructor
inode_sync_complete()
* clear I_SYNC flag after __writeback_single_inode() completed
writeback_sb_inodes() calls do_writepages() for dirty inodes after
setting I_SYNC flag on inode->i_state. do_writepages() in turn calls
nilfs_writepages(), which can run segment constructor and wait for its
completion. On the other hand, segment constructor calls iput(), which
can call evict() and wait for the I_SYNC flag on
inode_wait_for_writeback().
Since segment constructor doesn't know when I_SYNC will be set, it
cannot know whether iput() will block or not unless inode->i_nlink has a
non-zero count. We can prevent evict() from being called in iput() by
implementing sop->drop_inode(), but it's not preferable to leave inodes
with i_nlink == 0 for long periods because it even defers file
truncation and inode deallocation. So, this instead resolves the
deadlock by calling iput() asynchronously with a workqueue for inodes
with i_nlink == 0.
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Tested-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
fs/nilfs2/nilfs.h | 2 --
fs/nilfs2/segment.c | 44 +++++++++++++++++++++++++++++++++++++++-----
fs/nilfs2/segment.h | 5 +++++
3 files changed, 44 insertions(+), 7 deletions(-)
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -141,7 +141,6 @@ enum {
* @ti_save: Backup of journal_info field of task_struct
* @ti_flags: Flags
* @ti_count: Nest level
- * @ti_garbage: List of inode to be put when releasing semaphore
*/
struct nilfs_transaction_info {
u32 ti_magic;
@@ -150,7 +149,6 @@ struct nilfs_transaction_info {
one of other filesystems has a bug. */
unsigned short ti_flags;
unsigned short ti_count;
- struct list_head ti_garbage;
};
/* ti_magic */
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -305,7 +305,6 @@ static void nilfs_transaction_lock(struc
ti->ti_count = 0;
ti->ti_save = cur_ti;
ti->ti_magic = NILFS_TI_MAGIC;
- INIT_LIST_HEAD(&ti->ti_garbage);
current->journal_info = ti;
for (;;) {
@@ -332,8 +331,6 @@ static void nilfs_transaction_unlock(str
up_write(&nilfs->ns_segctor_sem);
current->journal_info = ti->ti_save;
- if (!list_empty(&ti->ti_garbage))
- nilfs_dispose_list(nilfs, &ti->ti_garbage, 0);
}
static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci,
@@ -746,6 +743,15 @@ static void nilfs_dispose_list(struct th
}
}
+static void nilfs_iput_work_func(struct work_struct *work)
+{
+ struct nilfs_sc_info *sci = container_of(work, struct nilfs_sc_info,
+ sc_iput_work);
+ struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
+
+ nilfs_dispose_list(nilfs, &sci->sc_iput_queue, 0);
+}
+
static int nilfs_test_metadata_dirty(struct the_nilfs *nilfs,
struct nilfs_root *root)
{
@@ -1900,8 +1906,8 @@ static int nilfs_segctor_collect_dirty_f
static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci,
struct the_nilfs *nilfs)
{
- struct nilfs_transaction_info *ti = current->journal_info;
struct nilfs_inode_info *ii, *n;
+ int defer_iput = false;
spin_lock(&nilfs->ns_inode_lock);
list_for_each_entry_safe(ii, n, &sci->sc_dirty_files, i_dirty) {
@@ -1912,9 +1918,24 @@ static void nilfs_segctor_drop_written_f
clear_bit(NILFS_I_BUSY, &ii->i_state);
brelse(ii->i_bh);
ii->i_bh = NULL;
- list_move_tail(&ii->i_dirty, &ti->ti_garbage);
+ list_del_init(&ii->i_dirty);
+ if (!ii->vfs_inode.i_nlink) {
+ /*
+ * Defer calling iput() to avoid a deadlock
+ * over I_SYNC flag for inodes with i_nlink == 0
+ */
+ list_add_tail(&ii->i_dirty, &sci->sc_iput_queue);
+ defer_iput = true;
+ } else {
+ spin_unlock(&nilfs->ns_inode_lock);
+ iput(&ii->vfs_inode);
+ spin_lock(&nilfs->ns_inode_lock);
+ }
}
spin_unlock(&nilfs->ns_inode_lock);
+
+ if (defer_iput)
+ schedule_work(&sci->sc_iput_work);
}
/*
@@ -2583,6 +2604,8 @@ static struct nilfs_sc_info *nilfs_segct
INIT_LIST_HEAD(&sci->sc_segbufs);
INIT_LIST_HEAD(&sci->sc_write_logs);
INIT_LIST_HEAD(&sci->sc_gc_inodes);
+ INIT_LIST_HEAD(&sci->sc_iput_queue);
+ INIT_WORK(&sci->sc_iput_work, nilfs_iput_work_func);
init_timer(&sci->sc_timer);
sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT;
@@ -2609,6 +2632,8 @@ static void nilfs_segctor_write_out(stru
ret = nilfs_segctor_construct(sci, SC_LSEG_SR);
nilfs_transaction_unlock(sci->sc_super);
+ flush_work(&sci->sc_iput_work);
+
} while (ret && retrycount-- > 0);
}
@@ -2633,6 +2658,9 @@ static void nilfs_segctor_destroy(struct
|| sci->sc_seq_request != sci->sc_seq_done);
spin_unlock(&sci->sc_state_lock);
+ if (flush_work(&sci->sc_iput_work))
+ flag = true;
+
if (flag || !nilfs_segctor_confirm(sci))
nilfs_segctor_write_out(sci);
@@ -2642,6 +2670,12 @@ static void nilfs_segctor_destroy(struct
nilfs_dispose_list(nilfs, &sci->sc_dirty_files, 1);
}
+ if (!list_empty(&sci->sc_iput_queue)) {
+ nilfs_warning(sci->sc_super, __func__,
+ "iput queue is not empty\n");
+ nilfs_dispose_list(nilfs, &sci->sc_iput_queue, 1);
+ }
+
WARN_ON(!list_empty(&sci->sc_segbufs));
WARN_ON(!list_empty(&sci->sc_write_logs));
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -26,6 +26,7 @@
#include <linux/types.h>
#include <linux/fs.h>
#include <linux/buffer_head.h>
+#include <linux/workqueue.h>
#include <linux/nilfs2_fs.h>
#include "nilfs.h"
@@ -92,6 +93,8 @@ struct nilfs_segsum_pointer {
* @sc_nblk_inc: Block count of current generation
* @sc_dirty_files: List of files to be written
* @sc_gc_inodes: List of GC inodes having blocks to be written
+ * @sc_iput_queue: list of inodes for which iput should be done
+ * @sc_iput_work: work struct to defer iput call
* @sc_freesegs: array of segment numbers to be freed
* @sc_nfreesegs: number of segments on @sc_freesegs
* @sc_dsync_inode: inode whose data pages are written for a sync operation
@@ -135,6 +138,8 @@ struct nilfs_sc_info {
struct list_head sc_dirty_files;
struct list_head sc_gc_inodes;
+ struct list_head sc_iput_queue;
+ struct work_struct sc_iput_work;
__u64 *sc_freesegs;
size_t sc_nfreesegs;
next prev parent reply other threads:[~2015-02-09 8:39 UTC|newest]
Thread overview: 42+ messages / expand[flat|nested] mbox.gz Atom feed top
2015-02-09 8:33 [PATCH 3.18 00/39] 3.18.7-stable review Greg Kroah-Hartman
2015-02-09 8:33 ` [PATCH 3.18 01/39] gpio: sysfs: fix memory leak in gpiod_export_link Greg Kroah-Hartman
2015-02-09 8:33 ` [PATCH 3.18 02/39] gpio: sysfs: fix memory leak in gpiod_sysfs_set_active_low Greg Kroah-Hartman
2015-02-09 8:33 ` [PATCH 3.18 03/39] gpio: mcp23s08: handle default gpio base Greg Kroah-Hartman
2015-02-09 8:33 ` [PATCH 3.18 04/39] PCI: designware: Reject MSI-X IRQs Greg Kroah-Hartman
2015-02-09 8:33 ` [PATCH 3.18 05/39] PCI: Add NEC variants to Stratus ftServer PCIe DMI check Greg Kroah-Hartman
2015-02-09 8:33 ` [PATCH 3.18 06/39] PCI: Handle read-only BARs on AMD CS553x devices Greg Kroah-Hartman
2015-02-09 8:33 ` [PATCH 3.18 07/39] spi: spi-fsl-dspi: Remove usage of devm_kzalloc Greg Kroah-Hartman
2015-02-09 8:33 ` [PATCH 3.18 08/39] spi: imx: use pio mode for i.mx6dl Greg Kroah-Hartman
2015-02-09 8:33 ` [PATCH 3.18 09/39] sd: Fix max transfer length for 4k disks Greg Kroah-Hartman
2015-02-09 8:33 ` [PATCH 3.18 10/39] MIPS: Fix C0_Pagegrain[IEC] support Greg Kroah-Hartman
2015-02-09 8:33 ` [PATCH 3.18 11/39] MIPS: IRQ: Fix disable_irq on CPU IRQs Greg Kroah-Hartman
2015-02-09 8:33 ` [PATCH 3.18 12/39] MIPS: OCTEON: fix kernel crash when offlining a CPU Greg Kroah-Hartman
2015-02-09 8:33 ` [PATCH 3.18 13/39] MIPS: Fix kernel lockup or crash after CPU offline/online Greg Kroah-Hartman
2015-02-09 8:33 ` [PATCH 3.18 14/39] MIPS: mipsregs.h: Add write_32bit_cp1_register() Greg Kroah-Hartman
2015-02-09 8:33 ` [PATCH 3.18 15/39] MIPS: traps: Fix inline asm ctc1 missing .set hardfloat Greg Kroah-Hartman
2015-02-09 8:33 ` [PATCH 3.18 16/39] ARM: 8299/1: mm: ensure local active ASID is marked as allocated on rollover Greg Kroah-Hartman
2015-02-09 8:34 ` [PATCH 3.18 17/39] Complete oplock break jobs before closing file handle Greg Kroah-Hartman
2015-02-09 8:34 ` [PATCH 3.18 18/39] md/raid5: fix another livelock caused by non-aligned writes Greg Kroah-Hartman
2015-02-09 8:34 ` [PATCH 3.18 19/39] mm: pagewalk: call pte_hole() for VM_PFNMAP during walk_page_range Greg Kroah-Hartman
2015-02-09 8:34 ` [PATCH 3.18 20/39] lib/checksum.c: fix carry in csum_tcpudp_nofold Greg Kroah-Hartman
2015-02-09 8:34 ` [PATCH 3.18 21/39] memcg, shmem: fix shmem migration to use lrucare Greg Kroah-Hartman
2015-02-09 8:34 ` Greg Kroah-Hartman [this message]
2015-02-09 8:34 ` [PATCH 3.18 23/39] drm/radeon: dont init gpuvm if accel is disabled (v3) Greg Kroah-Hartman
2015-02-09 8:34 ` [PATCH 3.18 27/39] drm/radeon: properly set vm fragment size for TN/RL Greg Kroah-Hartman
2015-02-09 8:34 ` [PATCH 3.18 29/39] arm64: Fix up /proc/cpuinfo Greg Kroah-Hartman
2015-02-09 8:34 ` [PATCH 3.18 30/39] lib/checksum.c: fix build for generic csum_tcpudp_nofold Greg Kroah-Hartman
2015-02-09 8:34 ` [PATCH 3.18 31/39] ASoC: atmel_ssc_dai: fix start event for I2S mode Greg Kroah-Hartman
2015-02-09 8:34 ` [PATCH 3.18 32/39] ASoC: sgtl5000: add delay before first I2C access Greg Kroah-Hartman
2015-02-09 8:34 ` [PATCH 3.18 33/39] ALSA: ak411x: Fix stall in work callback Greg Kroah-Hartman
2015-02-09 8:34 ` [PATCH 3.18 34/39] ARM: dts: Fix I2S1, I2S2 compatible for exynos4 SoCs Greg Kroah-Hartman
2015-02-09 8:34 ` [PATCH 3.18 35/39] x86, microcode: Return error from driver init code when loader is disabled Greg Kroah-Hartman
2015-02-09 8:34 ` [PATCH 3.18 36/39] smpboot: Add missing get_online_cpus() in smpboot_register_percpu_thread() Greg Kroah-Hartman
2015-02-09 8:34 ` [PATCH 3.18 37/39] hrtimer: Fix incorrect tai offset calculation for non high-res timer systems Greg Kroah-Hartman
2015-02-09 8:34 ` [PATCH 3.18 38/39] tracing: Add condition check to RCU lockdep checks Greg Kroah-Hartman
2015-02-09 8:34 ` [PATCH 3.18 39/39] x86/tlb/trace: Do not trace on CPU that is offline Greg Kroah-Hartman
2015-02-09 16:40 ` [PATCH 3.18 00/39] 3.18.7-stable review Guenter Roeck
2015-02-09 18:27 ` Shuah Khan
2015-02-09 21:37 ` Greg Kroah-Hartman
2015-02-09 22:03 ` Guenter Roeck
2015-02-09 22:24 ` Winkler, Tomas
2015-02-09 23:19 ` Shuah Khan
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20150209083329.832240939@linuxfoundation.org \
--to=gregkh@linuxfoundation.org \
--cc=akpm@linux-foundation.org \
--cc=konishi.ryusuke@lab.ntt.co.jp \
--cc=linux-kernel@vger.kernel.org \
--cc=stable@vger.kernel.org \
--cc=torvalds@linux-foundation.org \
--cc=viro@zeniv.linux.org.uk \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox