From: Christoph Hellwig <hch@lst.de>
Cc: linux-xfs@vger.kernel.org, Brian Foster <bfoster@redhat.com>,
"Darrick J . Wong" <darrick.wong@oracle.com>
Subject: [PATCH 04/27] xfs: sync eofblocks scans under iolock are livelock prone
Date: Mon, 27 Mar 2017 10:48:59 +0200
Message-ID: <20170327084922.11691-5-hch@lst.de>
In-Reply-To: <20170327084922.11691-1-hch@lst.de>

From: Brian Foster <bfoster@redhat.com>

commit c3155097ad89a956579bc305856a1f2878494e52 upstream.

The xfs_eofblocks.eof_scan_owner field is an internal field to
facilitate invoking eofb scans from the kernel while under the iolock.
This is necessary because the eofb scan acquires the iolock of each
inode. Synchronous scans are invoked on certain buffered write failures
while under iolock. In such cases, the scan owner indicates that the
context for the scan already owns the particular iolock and prevents a
double lock deadlock.

eofblocks scans while under iolock are still livelock prone in the event
of multiple parallel scans, however. If multiple buffered writes to
different inodes fail and invoke eofblocks scans at the same time, each
scan avoids a deadlock with its own inode by virtue of the
eof_scan_owner field, but will never be able to acquire the iolock of
the inode from the parallel scan. Because the low free space scans are
invoked with SYNC_WAIT, the scan will not return until it has processed
every tagged inode and thus both scans will spin indefinitely on the
iolock being held across the opposite scan. This problem can be
reproduced reliably by generic/224 on systems with higher cpu counts
(x16).

To avoid this problem, simplify the semantics of eofblocks scans to
never invoke a scan while under iolock. This means that the buffered
write context must drop the iolock before the scan. It must reacquire
the lock before the write retry and also repeat the initial write
checks, as the original state might no longer be valid once the iolock
was dropped.

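In rough outline, the buffered write path after this change looks like
the following (a simplified sketch only, with the -EDQUOT handling and
most error paths omitted; the authoritative code is in the hunks below):

write_retry:
        iolock = XFS_IOLOCK_EXCL;
        xfs_rw_ilock(ip, iolock);
        ret = xfs_file_aio_write_checks(iocb, from, &iolock);
        if (ret)
                goto out;

        ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops);
        if (ret == -ENOSPC && !enospc) {
                struct xfs_eofblocks eofb = {0};

                enospc = 1;
                xfs_flush_inodes(ip->i_mount);

                /* drop the iolock so the sync eofblocks scan cannot livelock */
                xfs_rw_iunlock(ip, iolock);
                eofb.eof_flags = XFS_EOF_FLAGS_SYNC;
                xfs_icache_free_eofblocks(ip->i_mount, &eofb);

                /* the retry retakes the iolock and repeats the write checks */
                goto write_retry;
        }
out:
        if (iolock)
                xfs_rw_iunlock(ip, iolock);
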
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
fs/xfs/xfs_file.c | 13 +++++++++----
fs/xfs/xfs_icache.c | 45 +++++++--------------------------------------
fs/xfs/xfs_icache.h | 2 --
3 files changed, 16 insertions(+), 44 deletions(-)

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 9a5d64b5f35a..81b7fee40b54 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -675,8 +675,10 @@ xfs_file_buffered_aio_write(
struct xfs_inode *ip = XFS_I(inode);
ssize_t ret;
int enospc = 0;
- int iolock = XFS_IOLOCK_EXCL;
+ int iolock;
+write_retry:
+ iolock = XFS_IOLOCK_EXCL;
xfs_rw_ilock(ip, iolock);
ret = xfs_file_aio_write_checks(iocb, from, &iolock);
@@ -686,7 +688,6 @@ xfs_file_buffered_aio_write(
/* We can write back this queue in page reclaim */
current->backing_dev_info = inode_to_bdi(inode);
-write_retry:
trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos);
ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops);
if (likely(ret >= 0))
@@ -702,18 +703,21 @@ xfs_file_buffered_aio_write(
* running at the same time.
*/
if (ret == -EDQUOT && !enospc) {
+ xfs_rw_iunlock(ip, iolock);
enospc = xfs_inode_free_quota_eofblocks(ip);
if (enospc)
goto write_retry;
enospc = xfs_inode_free_quota_cowblocks(ip);
if (enospc)
goto write_retry;
+ iolock = 0;
} else if (ret == -ENOSPC && !enospc) {
struct xfs_eofblocks eofb = {0};
enospc = 1;
xfs_flush_inodes(ip->i_mount);
- eofb.eof_scan_owner = ip->i_ino; /* for locking */
+
+ xfs_rw_iunlock(ip, iolock);
eofb.eof_flags = XFS_EOF_FLAGS_SYNC;
xfs_icache_free_eofblocks(ip->i_mount, &eofb);
goto write_retry;
@@ -721,7 +725,8 @@ xfs_file_buffered_aio_write(
current->backing_dev_info = NULL;
out:
- xfs_rw_iunlock(ip, iolock);
+ if (iolock)
+ xfs_rw_iunlock(ip, iolock);
return ret;
}
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index e4b382aabf9f..78708d001a63 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -1326,11 +1326,8 @@ xfs_inode_free_eofblocks(
{
int ret = 0;
struct xfs_eofblocks *eofb = args;
- bool need_iolock = true;
int match;
- ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0));
-
if (!xfs_can_free_eofblocks(ip, false)) {
/* inode could be preallocated or append-only */
trace_xfs_inode_free_eofblocks_invalid(ip);
@@ -1358,27 +1355,19 @@ xfs_inode_free_eofblocks(
if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE &&
XFS_ISIZE(ip) < eofb->eof_min_file_size)
return 0;
-
- /*
- * A scan owner implies we already hold the iolock. Skip it here
- * to avoid deadlock.
- */
- if (eofb->eof_scan_owner == ip->i_ino)
- need_iolock = false;
}
/*
* If the caller is waiting, return -EAGAIN to keep the background
* scanner moving and revisit the inode in a subsequent pass.
*/
- if (need_iolock && !xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
+ if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
if (flags & SYNC_WAIT)
ret = -EAGAIN;
return ret;
}
ret = xfs_free_eofblocks(ip);
- if (need_iolock)
- xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
return ret;
}
@@ -1425,15 +1414,10 @@ __xfs_inode_free_quota_eofblocks(
struct xfs_eofblocks eofb = {0};
struct xfs_dquot *dq;
- ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
-
/*
- * Set the scan owner to avoid a potential livelock. Otherwise, the scan
- * can repeatedly trylock on the inode we're currently processing. We
- * run a sync scan to increase effectiveness and use the union filter to
+ * Run a sync scan to increase effectiveness and use the union filter to
* cover all applicable quotas in a single scan.
*/
- eofb.eof_scan_owner = ip->i_ino;
eofb.eof_flags = XFS_EOF_FLAGS_UNION|XFS_EOF_FLAGS_SYNC;
if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) {
@@ -1585,12 +1569,9 @@ xfs_inode_free_cowblocks(
{
int ret;
struct xfs_eofblocks *eofb = args;
- bool need_iolock = true;
int match;
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
- ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0));
-
/*
* Just clear the tag if we have an empty cow fork or none at all. It's
* possible the inode was fully unshared since it was originally tagged.
@@ -1623,28 +1604,16 @@ xfs_inode_free_cowblocks(
if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE &&
XFS_ISIZE(ip) < eofb->eof_min_file_size)
return 0;
-
- /*
- * A scan owner implies we already hold the iolock. Skip it in
- * xfs_free_eofblocks() to avoid deadlock. This also eliminates
- * the possibility of EAGAIN being returned.
- */
- if (eofb->eof_scan_owner == ip->i_ino)
- need_iolock = false;
}
/* Free the CoW blocks */
- if (need_iolock) {
- xfs_ilock(ip, XFS_IOLOCK_EXCL);
- xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
- }
+ xfs_ilock(ip, XFS_IOLOCK_EXCL);
+ xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF);
- if (need_iolock) {
- xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
- xfs_iunlock(ip, XFS_IOLOCK_EXCL);
- }
+ xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
return ret;
}
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index a1e02f4708ab..8a7c849b4dea 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -27,7 +27,6 @@ struct xfs_eofblocks {
kgid_t eof_gid;
prid_t eof_prid;
__u64 eof_min_file_size;
- xfs_ino_t eof_scan_owner;
};
#define SYNC_WAIT 0x0001 /* wait for i/o to complete */
@@ -102,7 +101,6 @@ xfs_fs_eofblocks_from_user(
dst->eof_flags = src->eof_flags;
dst->eof_prid = src->eof_prid;
dst->eof_min_file_size = src->eof_min_file_size;
- dst->eof_scan_owner = NULLFSINO;
dst->eof_uid = INVALID_UID;
if (src->eof_flags & XFS_EOF_FLAGS_UID) {
--
2.11.0