From: Dave Chinner <david@fromorbit.com>
To: stable@kernel.org
Cc: xfs@oss.sgi.com
Subject: [PATCH 18/19] xfs: Non-blocking inode locking in IO completion
Date: Fri, 12 Mar 2010 09:42:16 +1100
Message-ID: <1268347337-7160-19-git-send-email-david@fromorbit.com>
In-Reply-To: <1268347337-7160-1-git-send-email-david@fromorbit.com>
From 77d7a0c2eeb285c9069e15396703d0cb9690ac50
Date: Wed, 17 Feb 2010 05:36:29 +0000
The introduction of barriers to loop devices has created a new IO
completion order dependency that XFS does not handle. The loop
device implements barriers using fsync and so turns a log IO in the
XFS filesystem on the loop device into a data IO in the backing
filesystem. That is, the completion of log IOs in the loop
filesystem is now dependent on the completion of data IO in the
backing filesystem.
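For reference, the loop driver's barrier handling at the time was
conceptually equivalent to the sketch below. This is a simplified
illustration of the mechanism, not the actual drivers/block/loop.c
code, and the exact helper signatures are assumptions for the
2.6.33-era API:

	/*
	 * A barrier bio on the loop device is implemented by fsyncing
	 * the backing file, which is ordinary data IO as far as the
	 * backing filesystem is concerned.
	 */
	if (bio_rw_flagged(bio, BIO_RW_BARRIER))
		error = vfs_fsync(lo->lo_backing_file,
				  lo->lo_backing_file->f_path.dentry, 0);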
This can cause deadlocks when a flush daemon issues a log force with
an inode locked, because IO completion for that inode is blocked by
the inode lock. This in turn prevents further data IO completion from
occurring on all XFS filesystems on that CPU (due to the shared
nature of the completion queues). This then prevents the log IO from
completing, because the log is waiting for data IO completion as
well.
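Spelled out, the dependency cycle is:

	flush daemon:	holds xfs_ilock(ip), issues log force, waits
			for log IO completion
	log IO:		barrier on the loop device -> fsync -> data IO
			in the backing filesystem
	xfsdatad:	data IO completion blocks on xfs_ilock(ip),
			stalling the shared completion queue, so the
			log IO never completes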
The fix for this new completion-order dependency is to make the IO
completion inode locking non-blocking. If the inode lock can't be
grabbed, simply requeue the IO completion back to the work queue so
that it can be processed later. This prevents the completion queue
from being blocked and allows data IO completion on other inodes to
proceed, hence avoiding completion-order-dependent deadlocks.
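In code, each of the write completion handlers applies the same
pattern (condensed from the diff below):

	error = xfs_setfilesize(ioend);
	if (error == EAGAIN) {
		/* take a new reference and requeue the completion */
		atomic_inc(&ioend->io_remaining);
		xfs_finish_ioend(ioend, 0);
		/* ensure we don't spin on blocked ioends */
		delay(1);
		return;
	}
	ASSERT(!error);
	xfs_destroy_ioend(ioend);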
Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
fs/xfs/linux-2.6/xfs_aops.c | 118 ++++++++++++++++++++++++++++++-------------
1 files changed, 82 insertions(+), 36 deletions(-)
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index c2e30ee..7263002 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -204,14 +204,17 @@ xfs_ioend_new_eof(
}
/*
- * Update on-disk file size now that data has been written to disk.
- * The current in-memory file size is i_size. If a write is beyond
- * eof i_new_size will be the intended file size until i_size is
- * updated. If this write does not extend all the way to the valid
- * file size then restrict this update to the end of the write.
+ * Update on-disk file size now that data has been written to disk. The
+ * current in-memory file size is i_size. If a write is beyond eof i_new_size
+ * will be the intended file size until i_size is updated. If this write does
+ * not extend all the way to the valid file size then restrict this update to
+ * the end of the write.
+ *
+ * This function does not block as blocking on the inode lock in IO completion
+ * can lead to IO completion order dependency deadlocks. If it can't get the
+ * inode ilock it will return EAGAIN. Callers must handle this.
*/
-
-STATIC void
+STATIC int
xfs_setfilesize(
xfs_ioend_t *ioend)
{
@@ -222,9 +225,11 @@ xfs_setfilesize(
ASSERT(ioend->io_type != IOMAP_READ);
if (unlikely(ioend->io_error))
- return;
+ return 0;
+
+ if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
+ return EAGAIN;
- xfs_ilock(ip, XFS_ILOCK_EXCL);
isize = xfs_ioend_new_eof(ioend);
if (isize) {
ip->i_d.di_size = isize;
@@ -232,6 +237,28 @@ xfs_setfilesize(
}
xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return 0;
+}
+
+/*
+ * Schedule IO completion handling on an xfsdatad if this was
+ * the final hold on this ioend. If we are asked to wait,
+ * flush the workqueue.
+ */
+STATIC void
+xfs_finish_ioend(
+ xfs_ioend_t *ioend,
+ int wait)
+{
+ if (atomic_dec_and_test(&ioend->io_remaining)) {
+ struct workqueue_struct *wq;
+
+ wq = (ioend->io_type == IOMAP_UNWRITTEN) ?
+ xfsconvertd_workqueue : xfsdatad_workqueue;
+ queue_work(wq, &ioend->io_work);
+ if (wait)
+ flush_workqueue(wq);
+ }
}
/*
@@ -243,9 +270,23 @@ xfs_end_bio_delalloc(
{
xfs_ioend_t *ioend =
container_of(work, xfs_ioend_t, io_work);
+ int error;
- xfs_setfilesize(ioend);
- xfs_destroy_ioend(ioend);
+ /*
+ * If we didn't complete processing of the ioend, requeue it to the
+ * tail of the workqueue for another attempt later. Otherwise destroy
+ * it.
+ */
+ error = xfs_setfilesize(ioend);
+ if (error == EAGAIN) {
+ atomic_inc(&ioend->io_remaining);
+ xfs_finish_ioend(ioend, 0);
+ /* ensure we don't spin on blocked ioends */
+ delay(1);
+ } else {
+ ASSERT(!error);
+ xfs_destroy_ioend(ioend);
+ }
}
/*
@@ -257,9 +298,23 @@ xfs_end_bio_written(
{
xfs_ioend_t *ioend =
container_of(work, xfs_ioend_t, io_work);
+ int error;
- xfs_setfilesize(ioend);
- xfs_destroy_ioend(ioend);
+ /*
+ * If we didn't complete processing of the ioend, requeue it to the
+ * tail of the workqueue for another attempt later. Otherwise destroy
+ * it.
+ */
+ error = xfs_setfilesize(ioend);
+ if (error == EAGAIN) {
+ atomic_inc(&ioend->io_remaining);
+ xfs_finish_ioend(ioend, 0);
+ /* ensure we don't spin on blocked ioends */
+ delay(1);
+ } else {
+ ASSERT(!error);
+ xfs_destroy_ioend(ioend);
+ }
}
/*
@@ -279,13 +334,25 @@ xfs_end_bio_unwritten(
size_t size = ioend->io_size;
if (likely(!ioend->io_error)) {
+ int error;
if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
- int error;
error = xfs_iomap_write_unwritten(ip, offset, size);
if (error)
ioend->io_error = error;
}
- xfs_setfilesize(ioend);
+ /*
+ * If we didn't complete processing of the ioend, requeue it to the
+ * tail of the workqueue for another attempt later. Otherwise destroy
+ * it.
+ */
+ error = xfs_setfilesize(ioend);
+ if (error == EAGAIN) {
+ atomic_inc(&ioend->io_remaining);
+ xfs_finish_ioend(ioend, 0);
+ /* ensure we don't spin on blocked ioends */
+ delay(1);
+ return;
+ }
}
xfs_destroy_ioend(ioend);
}
@@ -304,27 +371,6 @@ xfs_end_bio_read(
}
/*
- * Schedule IO completion handling on a xfsdatad if this was
- * the final hold on this ioend. If we are asked to wait,
- * flush the workqueue.
- */
-STATIC void
-xfs_finish_ioend(
- xfs_ioend_t *ioend,
- int wait)
-{
- if (atomic_dec_and_test(&ioend->io_remaining)) {
- struct workqueue_struct *wq = xfsdatad_workqueue;
- if (ioend->io_work.func == xfs_end_bio_unwritten)
- wq = xfsconvertd_workqueue;
-
- queue_work(wq, &ioend->io_work);
- if (wait)
- flush_workqueue(wq);
- }
-}
-
-/*
* Allocate and initialise an IO completion structure.
* We need to track unwritten extent write completion here initially.
* We'll need to extend this for updating the ondisk inode size later
--
1.6.5