* [PATCH 1/2] XFS: Prevent unwritten extent conversion from blocking I/O completion
2009-03-15 11:40 [PATCH 0/2] Fix a couple of random hangs Dave Chinner
@ 2009-03-15 11:40 ` Dave Chinner
2009-03-16 9:21 ` Christoph Hellwig
2009-03-15 11:40 ` [PATCH 2/2] XFS: Inform the xfsaild of the push target before sleeping Dave Chinner
1 sibling, 1 reply; 7+ messages in thread
From: Dave Chinner @ 2009-03-15 11:40 UTC (permalink / raw)
To: xfs
Unwritten extent conversion can recurse back into the filesystem due
to memory allocation. Memory reclaim requires I/O completions to be
processed to allow the callers to make progress. If the I/O
completion workqueue thread is doing the recursion, then we have a
deadlock situation.
Move unwritten extent completion into it's own workqueue so it
doesn't block I/O completions for normal delayed allocation or
overwrite data.
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
fs/xfs/linux-2.6/xfs_aops.c | 38 +++++++++++++++++++++-----------------
fs/xfs/linux-2.6/xfs_aops.h | 1 +
fs/xfs/linux-2.6/xfs_buf.c | 9 +++++++++
3 files changed, 31 insertions(+), 17 deletions(-)
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index de3a198..763201d 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -153,23 +153,6 @@ xfs_find_bdev_for_inode(
}
/*
- * Schedule IO completion handling on a xfsdatad if this was
- * the final hold on this ioend. If we are asked to wait,
- * flush the workqueue.
- */
-STATIC void
-xfs_finish_ioend(
- xfs_ioend_t *ioend,
- int wait)
-{
- if (atomic_dec_and_test(&ioend->io_remaining)) {
- queue_work(xfsdatad_workqueue, &ioend->io_work);
- if (wait)
- flush_workqueue(xfsdatad_workqueue);
- }
-}
-
-/*
* We're now finished for good with this ioend structure.
* Update the page state via the associated buffer_heads,
* release holds on the inode and bio, and finally free
@@ -310,6 +293,27 @@ xfs_end_bio_read(
}
/*
+ * Schedule IO completion handling on a xfsdatad if this was
+ * the final hold on this ioend. If we are asked to wait,
+ * flush the workqueue.
+ */
+STATIC void
+xfs_finish_ioend(
+ xfs_ioend_t *ioend,
+ int wait)
+{
+ if (atomic_dec_and_test(&ioend->io_remaining)) {
+ struct workqueue_struct *wq = xfsdatad_workqueue;
+ if (ioend->io_work.func == xfs_end_bio_unwritten)
+ wq = xfsconvertd_workqueue;
+
+ queue_work(wq, &ioend->io_work);
+ if (wait)
+ flush_workqueue(wq);
+ }
+}
+
+/*
* Allocate and initialise an IO completion structure.
* We need to track unwritten extent write completion here initially.
* We'll need to extend this for updating the ondisk inode size later
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h
index 1dd5288..221b3e6 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/linux-2.6/xfs_aops.h
@@ -19,6 +19,7 @@
#define __XFS_AOPS_H__
extern struct workqueue_struct *xfsdatad_workqueue;
+extern struct workqueue_struct *xfsconvertd_workqueue;
extern mempool_t *xfs_ioend_pool;
/*
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index aa1016b..e28800a 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -51,6 +51,7 @@ static struct shrinker xfs_buf_shake = {
static struct workqueue_struct *xfslogd_workqueue;
struct workqueue_struct *xfsdatad_workqueue;
+struct workqueue_struct *xfsconvertd_workqueue;
#ifdef XFS_BUF_TRACE
void
@@ -1775,6 +1776,7 @@ xfs_flush_buftarg(
xfs_buf_t *bp, *n;
int pincount = 0;
+ xfs_buf_runall_queues(xfsconvertd_workqueue);
xfs_buf_runall_queues(xfsdatad_workqueue);
xfs_buf_runall_queues(xfslogd_workqueue);
@@ -1831,9 +1833,15 @@ xfs_buf_init(void)
if (!xfsdatad_workqueue)
goto out_destroy_xfslogd_workqueue;
+ xfsconvertd_workqueue = create_workqueue("xfsconvertd");
+ if (!xfsconvertd_workqueue)
+ goto out_destroy_xfsdatad_workqueue;
+
register_shrinker(&xfs_buf_shake);
return 0;
+ out_destroy_xfsdatad_workqueue:
+ destroy_workqueue(xfsdatad_workqueue);
out_destroy_xfslogd_workqueue:
destroy_workqueue(xfslogd_workqueue);
out_free_buf_zone:
@@ -1849,6 +1857,7 @@ void
xfs_buf_terminate(void)
{
unregister_shrinker(&xfs_buf_shake);
+ destroy_workqueue(xfsconvertd_workqueue);
destroy_workqueue(xfsdatad_workqueue);
destroy_workqueue(xfslogd_workqueue);
kmem_zone_destroy(xfs_buf_zone);
--
1.6.2
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply related [flat|nested] 7+ messages in thread* [PATCH 2/2] XFS: Inform the xfsaild of the push target before sleeping
2009-03-15 11:40 [PATCH 0/2] Fix a couple of random hangs Dave Chinner
2009-03-15 11:40 ` [PATCH 1/2] XFS: Prevent unwritten extent conversion from blocking I/O completion Dave Chinner
@ 2009-03-15 11:40 ` Dave Chinner
2009-03-16 10:38 ` Christoph Hellwig
1 sibling, 1 reply; 7+ messages in thread
From: Dave Chinner @ 2009-03-15 11:40 UTC (permalink / raw)
To: xfs
When trying to reserve log space, we find the amount of space
we need, then go to sleep waiting for space. When we are
woken, we try to push the tail of the log forward to make
sure we have space available.
Unfortunately, this means that if there is not space available, and
everyone who needs space goes to sleep there is no-one left to push
the tail of the log to make space available. Once we have a thread
waiting for space to become available, the others queue up behind
it in a FIFO, and none of them push the tail of the log.
This can result in everyone going to sleep in xlog_grant_log_space()
if the first sleeper races with the last I/O that moves the tail
of the log forward. With no further I/O tomove the tail of the log,
there is nothing to wake the sleepers and hence all transactions
just stop.
Fix this by making sure the xfsaild will create enough space for the
transaction that is about to sleep by moving the push target far
enough forwards to ensure that that the curent proceeees will have
enough space available when it is woken. That is, we push the
AIL before we go to sleep.
Because we've inserted the log ticket into the queue before we've
pushed and gone to sleep, subsequent transactions will wait behind
this one. Hence we are guaranteed to have space available when we
are woken.
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
fs/xfs/xfs_log.c | 37 +++++++++++++++++++------------------
1 files changed, 19 insertions(+), 18 deletions(-)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 60e6e63..278960c 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -2573,18 +2573,19 @@ redo:
xlog_ins_ticketq(&log->l_reserve_headq, tic);
xlog_trace_loggrant(log, tic,
"xlog_grant_log_space: sleep 2");
+ spin_unlock(&log->l_grant_lock);
+ xlog_grant_push_ail(log->l_mp, need_bytes);
+ spin_lock(&log->l_grant_lock);
+
XFS_STATS_INC(xs_sleep_logspace);
sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
- if (XLOG_FORCED_SHUTDOWN(log)) {
- spin_lock(&log->l_grant_lock);
+ spin_lock(&log->l_grant_lock);
+ if (XLOG_FORCED_SHUTDOWN(log))
goto error_return;
- }
xlog_trace_loggrant(log, tic,
"xlog_grant_log_space: wake 2");
- xlog_grant_push_ail(log->l_mp, need_bytes);
- spin_lock(&log->l_grant_lock);
goto redo;
} else if (tic->t_flags & XLOG_TIC_IN_Q)
xlog_del_ticketq(&log->l_reserve_headq, tic);
@@ -2663,7 +2664,7 @@ xlog_regrant_write_log_space(xlog_t *log,
* for more free space, otherwise try to get some space for
* this transaction.
*/
-
+ need_bytes = tic->t_unit_res;
if ((ntic = log->l_write_headq)) {
free_bytes = xlog_space_left(log, log->l_grant_write_cycle,
log->l_grant_write_bytes);
@@ -2683,26 +2684,25 @@ xlog_regrant_write_log_space(xlog_t *log,
xlog_trace_loggrant(log, tic,
"xlog_regrant_write_log_space: sleep 1");
+ spin_unlock(&log->l_grant_lock);
+ xlog_grant_push_ail(log->l_mp, need_bytes);
+ spin_lock(&log->l_grant_lock);
+
XFS_STATS_INC(xs_sleep_logspace);
sv_wait(&tic->t_wait, PINOD|PLTWAIT,
&log->l_grant_lock, s);
/* If we're shutting down, this tic is already
* off the queue */
- if (XLOG_FORCED_SHUTDOWN(log)) {
- spin_lock(&log->l_grant_lock);
+ spin_lock(&log->l_grant_lock);
+ if (XLOG_FORCED_SHUTDOWN(log))
goto error_return;
- }
xlog_trace_loggrant(log, tic,
"xlog_regrant_write_log_space: wake 1");
- xlog_grant_push_ail(log->l_mp, tic->t_unit_res);
- spin_lock(&log->l_grant_lock);
}
}
- need_bytes = tic->t_unit_res;
-
redo:
if (XLOG_FORCED_SHUTDOWN(log))
goto error_return;
@@ -2712,19 +2712,20 @@ redo:
if (free_bytes < need_bytes) {
if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
xlog_ins_ticketq(&log->l_write_headq, tic);
+ spin_unlock(&log->l_grant_lock);
+ xlog_grant_push_ail(log->l_mp, need_bytes);
+ spin_lock(&log->l_grant_lock);
+
XFS_STATS_INC(xs_sleep_logspace);
sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
/* If we're shutting down, this tic is already off the queue */
- if (XLOG_FORCED_SHUTDOWN(log)) {
- spin_lock(&log->l_grant_lock);
+ spin_lock(&log->l_grant_lock);
+ if (XLOG_FORCED_SHUTDOWN(log))
goto error_return;
- }
xlog_trace_loggrant(log, tic,
"xlog_regrant_write_log_space: wake 2");
- xlog_grant_push_ail(log->l_mp, need_bytes);
- spin_lock(&log->l_grant_lock);
goto redo;
} else if (tic->t_flags & XLOG_TIC_IN_Q)
xlog_del_ticketq(&log->l_write_headq, tic);
--
1.6.2
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply related [flat|nested] 7+ messages in thread