All of lore.kernel.org
 help / color / mirror / Atom feed
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
To: linux-kernel@vger.kernel.org
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>,
	stable@vger.kernel.org,
	Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>,
	Al Viro <viro@zeniv.linux.org.uk>,
	Andrew Morton <akpm@linux-foundation.org>,
	Linus Torvalds <torvalds@linux-foundation.org>
Subject: [PATCH 3.18 22/39] nilfs2: fix deadlock of segment constructor over I_SYNC flag
Date: Mon,  9 Feb 2015 16:34:05 +0800	[thread overview]
Message-ID: <20150209083329.832240939@linuxfoundation.org> (raw)
In-Reply-To: <20150209083328.753647350@linuxfoundation.org>

3.18-stable review patch.  If anyone has any objections, please let me know.

------------------

From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>

commit 7ef3ff2fea8bf5e4a21cef47ad87710a3d0fdb52 upstream.

Nilfs2 eventually hangs in a stress test with fsstress program.  This
issue was caused by the following deadlock over I_SYNC flag between
nilfs_segctor_thread() and writeback_sb_inodes():

  nilfs_segctor_thread()
    nilfs_segctor_thread_construct()
      nilfs_segctor_unlock()
        nilfs_dispose_list()
          iput()
            iput_final()
              evict()
                inode_wait_for_writeback()  * wait for I_SYNC flag

  writeback_sb_inodes()
     * set I_SYNC flag on inode->i_state
    __writeback_single_inode()
      do_writepages()
        nilfs_writepages()
          nilfs_construct_dsync_segment()
            nilfs_segctor_sync()
               * wait for completion of segment constructor
    inode_sync_complete()
       * clear I_SYNC flag after __writeback_single_inode() completed

writeback_sb_inodes() calls do_writepages() for dirty inodes after
setting I_SYNC flag on inode->i_state.  do_writepages() in turn calls
nilfs_writepages(), which can run segment constructor and wait for its
completion.  On the other hand, segment constructor calls iput(), which
can call evict() and wait for the I_SYNC flag on
inode_wait_for_writeback().

Since segment constructor doesn't know when I_SYNC will be set, it
cannot know whether iput() will block or not unless inode->i_nlink has a
non-zero count.  We can prevent evict() from being called in iput() by
implementing sop->drop_inode(), but it's not preferable to leave inodes
with i_nlink == 0 for long periods because it even defers file
truncation and inode deallocation.  So, this instead resolves the
deadlock by calling iput() asynchronously with a workqueue for inodes
with i_nlink == 0.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Tested-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

---
 fs/nilfs2/nilfs.h   |    2 --
 fs/nilfs2/segment.c |   44 +++++++++++++++++++++++++++++++++++++++-----
 fs/nilfs2/segment.h |    5 +++++
 3 files changed, 44 insertions(+), 7 deletions(-)

--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -141,7 +141,6 @@ enum {
  * @ti_save: Backup of journal_info field of task_struct
  * @ti_flags: Flags
  * @ti_count: Nest level
- * @ti_garbage:	List of inode to be put when releasing semaphore
  */
 struct nilfs_transaction_info {
 	u32			ti_magic;
@@ -150,7 +149,6 @@ struct nilfs_transaction_info {
 				   one of other filesystems has a bug. */
 	unsigned short		ti_flags;
 	unsigned short		ti_count;
-	struct list_head	ti_garbage;
 };
 
 /* ti_magic */
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -305,7 +305,6 @@ static void nilfs_transaction_lock(struc
 	ti->ti_count = 0;
 	ti->ti_save = cur_ti;
 	ti->ti_magic = NILFS_TI_MAGIC;
-	INIT_LIST_HEAD(&ti->ti_garbage);
 	current->journal_info = ti;
 
 	for (;;) {
@@ -332,8 +331,6 @@ static void nilfs_transaction_unlock(str
 
 	up_write(&nilfs->ns_segctor_sem);
 	current->journal_info = ti->ti_save;
-	if (!list_empty(&ti->ti_garbage))
-		nilfs_dispose_list(nilfs, &ti->ti_garbage, 0);
 }
 
 static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci,
@@ -746,6 +743,15 @@ static void nilfs_dispose_list(struct th
 	}
 }
 
+static void nilfs_iput_work_func(struct work_struct *work)
+{
+	struct nilfs_sc_info *sci = container_of(work, struct nilfs_sc_info,
+						 sc_iput_work);
+	struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
+
+	nilfs_dispose_list(nilfs, &sci->sc_iput_queue, 0);
+}
+
 static int nilfs_test_metadata_dirty(struct the_nilfs *nilfs,
 				     struct nilfs_root *root)
 {
@@ -1900,8 +1906,8 @@ static int nilfs_segctor_collect_dirty_f
 static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci,
 					     struct the_nilfs *nilfs)
 {
-	struct nilfs_transaction_info *ti = current->journal_info;
 	struct nilfs_inode_info *ii, *n;
+	int defer_iput = false;
 
 	spin_lock(&nilfs->ns_inode_lock);
 	list_for_each_entry_safe(ii, n, &sci->sc_dirty_files, i_dirty) {
@@ -1912,9 +1918,24 @@ static void nilfs_segctor_drop_written_f
 		clear_bit(NILFS_I_BUSY, &ii->i_state);
 		brelse(ii->i_bh);
 		ii->i_bh = NULL;
-		list_move_tail(&ii->i_dirty, &ti->ti_garbage);
+		list_del_init(&ii->i_dirty);
+		if (!ii->vfs_inode.i_nlink) {
+			/*
+			 * Defer calling iput() to avoid a deadlock
+			 * over I_SYNC flag for inodes with i_nlink == 0
+			 */
+			list_add_tail(&ii->i_dirty, &sci->sc_iput_queue);
+			defer_iput = true;
+		} else {
+			spin_unlock(&nilfs->ns_inode_lock);
+			iput(&ii->vfs_inode);
+			spin_lock(&nilfs->ns_inode_lock);
+		}
 	}
 	spin_unlock(&nilfs->ns_inode_lock);
+
+	if (defer_iput)
+		schedule_work(&sci->sc_iput_work);
 }
 
 /*
@@ -2583,6 +2604,8 @@ static struct nilfs_sc_info *nilfs_segct
 	INIT_LIST_HEAD(&sci->sc_segbufs);
 	INIT_LIST_HEAD(&sci->sc_write_logs);
 	INIT_LIST_HEAD(&sci->sc_gc_inodes);
+	INIT_LIST_HEAD(&sci->sc_iput_queue);
+	INIT_WORK(&sci->sc_iput_work, nilfs_iput_work_func);
 	init_timer(&sci->sc_timer);
 
 	sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT;
@@ -2609,6 +2632,8 @@ static void nilfs_segctor_write_out(stru
 		ret = nilfs_segctor_construct(sci, SC_LSEG_SR);
 		nilfs_transaction_unlock(sci->sc_super);
 
+		flush_work(&sci->sc_iput_work);
+
 	} while (ret && retrycount-- > 0);
 }
 
@@ -2633,6 +2658,9 @@ static void nilfs_segctor_destroy(struct
 		|| sci->sc_seq_request != sci->sc_seq_done);
 	spin_unlock(&sci->sc_state_lock);
 
+	if (flush_work(&sci->sc_iput_work))
+		flag = true;
+
 	if (flag || !nilfs_segctor_confirm(sci))
 		nilfs_segctor_write_out(sci);
 
@@ -2642,6 +2670,12 @@ static void nilfs_segctor_destroy(struct
 		nilfs_dispose_list(nilfs, &sci->sc_dirty_files, 1);
 	}
 
+	if (!list_empty(&sci->sc_iput_queue)) {
+		nilfs_warning(sci->sc_super, __func__,
+			      "iput queue is not empty\n");
+		nilfs_dispose_list(nilfs, &sci->sc_iput_queue, 1);
+	}
+
 	WARN_ON(!list_empty(&sci->sc_segbufs));
 	WARN_ON(!list_empty(&sci->sc_write_logs));
 
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -26,6 +26,7 @@
 #include <linux/types.h>
 #include <linux/fs.h>
 #include <linux/buffer_head.h>
+#include <linux/workqueue.h>
 #include <linux/nilfs2_fs.h>
 #include "nilfs.h"
 
@@ -92,6 +93,8 @@ struct nilfs_segsum_pointer {
  * @sc_nblk_inc: Block count of current generation
  * @sc_dirty_files: List of files to be written
  * @sc_gc_inodes: List of GC inodes having blocks to be written
+ * @sc_iput_queue: list of inodes for which iput should be done
+ * @sc_iput_work: work struct to defer iput call
  * @sc_freesegs: array of segment numbers to be freed
  * @sc_nfreesegs: number of segments on @sc_freesegs
  * @sc_dsync_inode: inode whose data pages are written for a sync operation
@@ -135,6 +138,8 @@ struct nilfs_sc_info {
 
 	struct list_head	sc_dirty_files;
 	struct list_head	sc_gc_inodes;
+	struct list_head	sc_iput_queue;
+	struct work_struct	sc_iput_work;
 
 	__u64		       *sc_freesegs;
 	size_t			sc_nfreesegs;



  parent reply	other threads:[~2015-02-09  8:39 UTC|newest]

Thread overview: 43+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-02-09  8:33 [PATCH 3.18 00/39] 3.18.7-stable review Greg Kroah-Hartman
2015-02-09  8:33 ` [PATCH 3.18 01/39] gpio: sysfs: fix memory leak in gpiod_export_link Greg Kroah-Hartman
2015-02-09  8:33 ` [PATCH 3.18 02/39] gpio: sysfs: fix memory leak in gpiod_sysfs_set_active_low Greg Kroah-Hartman
2015-02-09  8:33 ` [PATCH 3.18 03/39] gpio: mcp23s08: handle default gpio base Greg Kroah-Hartman
2015-02-09  8:33 ` [PATCH 3.18 04/39] PCI: designware: Reject MSI-X IRQs Greg Kroah-Hartman
2015-02-09  8:33 ` [PATCH 3.18 05/39] PCI: Add NEC variants to Stratus ftServer PCIe DMI check Greg Kroah-Hartman
2015-02-09  8:33 ` [PATCH 3.18 06/39] PCI: Handle read-only BARs on AMD CS553x devices Greg Kroah-Hartman
2015-02-09  8:33 ` [PATCH 3.18 07/39] spi: spi-fsl-dspi: Remove usage of devm_kzalloc Greg Kroah-Hartman
2015-02-09  8:33 ` [PATCH 3.18 08/39] spi: imx: use pio mode for i.mx6dl Greg Kroah-Hartman
2015-02-09  8:33 ` [PATCH 3.18 09/39] sd: Fix max transfer length for 4k disks Greg Kroah-Hartman
2015-02-09  8:33 ` [PATCH 3.18 10/39] MIPS: Fix C0_Pagegrain[IEC] support Greg Kroah-Hartman
2015-02-09  8:33 ` [PATCH 3.18 11/39] MIPS: IRQ: Fix disable_irq on CPU IRQs Greg Kroah-Hartman
2015-02-09  8:33 ` [PATCH 3.18 12/39] MIPS: OCTEON: fix kernel crash when offlining a CPU Greg Kroah-Hartman
2015-02-09  8:33 ` [PATCH 3.18 13/39] MIPS: Fix kernel lockup or crash after CPU offline/online Greg Kroah-Hartman
2015-02-09  8:33 ` [PATCH 3.18 14/39] MIPS: mipsregs.h: Add write_32bit_cp1_register() Greg Kroah-Hartman
2015-02-09  8:33 ` [PATCH 3.18 15/39] MIPS: traps: Fix inline asm ctc1 missing .set hardfloat Greg Kroah-Hartman
2015-02-09  8:33 ` [PATCH 3.18 16/39] ARM: 8299/1: mm: ensure local active ASID is marked as allocated on rollover Greg Kroah-Hartman
2015-02-09  8:34 ` [PATCH 3.18 17/39] Complete oplock break jobs before closing file handle Greg Kroah-Hartman
2015-02-09  8:34 ` [PATCH 3.18 18/39] md/raid5: fix another livelock caused by non-aligned writes Greg Kroah-Hartman
2015-02-09  8:34 ` [PATCH 3.18 19/39] mm: pagewalk: call pte_hole() for VM_PFNMAP during walk_page_range Greg Kroah-Hartman
2015-02-09  8:34 ` [PATCH 3.18 20/39] lib/checksum.c: fix carry in csum_tcpudp_nofold Greg Kroah-Hartman
2015-02-09  8:34 ` [PATCH 3.18 21/39] memcg, shmem: fix shmem migration to use lrucare Greg Kroah-Hartman
2015-02-09  8:34 ` Greg Kroah-Hartman [this message]
2015-02-09  8:34 ` [PATCH 3.18 23/39] drm/radeon: dont init gpuvm if accel is disabled (v3) Greg Kroah-Hartman
2015-02-09  8:34 ` [PATCH 3.18 27/39] drm/radeon: properly set vm fragment size for TN/RL Greg Kroah-Hartman
2015-02-09  8:34 ` [PATCH 3.18 29/39] arm64: Fix up /proc/cpuinfo Greg Kroah-Hartman
2015-02-09  8:34   ` Greg Kroah-Hartman
2015-02-09  8:34 ` [PATCH 3.18 30/39] lib/checksum.c: fix build for generic csum_tcpudp_nofold Greg Kroah-Hartman
2015-02-09  8:34 ` [PATCH 3.18 31/39] ASoC: atmel_ssc_dai: fix start event for I2S mode Greg Kroah-Hartman
2015-02-09  8:34 ` [PATCH 3.18 32/39] ASoC: sgtl5000: add delay before first I2C access Greg Kroah-Hartman
2015-02-09  8:34 ` [PATCH 3.18 33/39] ALSA: ak411x: Fix stall in work callback Greg Kroah-Hartman
2015-02-09  8:34 ` [PATCH 3.18 34/39] ARM: dts: Fix I2S1, I2S2 compatible for exynos4 SoCs Greg Kroah-Hartman
2015-02-09  8:34 ` [PATCH 3.18 35/39] x86, microcode: Return error from driver init code when loader is disabled Greg Kroah-Hartman
2015-02-09  8:34 ` [PATCH 3.18 36/39] smpboot: Add missing get_online_cpus() in smpboot_register_percpu_thread() Greg Kroah-Hartman
2015-02-09  8:34 ` [PATCH 3.18 37/39] hrtimer: Fix incorrect tai offset calculation for non high-res timer systems Greg Kroah-Hartman
2015-02-09  8:34 ` [PATCH 3.18 38/39] tracing: Add condition check to RCU lockdep checks Greg Kroah-Hartman
2015-02-09  8:34 ` [PATCH 3.18 39/39] x86/tlb/trace: Do not trace on CPU that is offline Greg Kroah-Hartman
2015-02-09 16:40 ` [PATCH 3.18 00/39] 3.18.7-stable review Guenter Roeck
2015-02-09 18:27 ` Shuah Khan
2015-02-09 21:37   ` Greg Kroah-Hartman
2015-02-09 22:03     ` Guenter Roeck
2015-02-09 22:24       ` Winkler, Tomas
2015-02-09 23:19         ` Shuah Khan

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20150209083329.832240939@linuxfoundation.org \
    --to=gregkh@linuxfoundation.org \
    --cc=akpm@linux-foundation.org \
    --cc=konishi.ryusuke@lab.ntt.co.jp \
    --cc=linux-kernel@vger.kernel.org \
    --cc=stable@vger.kernel.org \
    --cc=torvalds@linux-foundation.org \
    --cc=viro@zeniv.linux.org.uk \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.