public inbox for linux-xfs@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH 01/13] xfs: xfs_syncd_stop must die
  2012-08-30 10:57 [PATCH 00/13] xfs: remove the xfssyncd mess Dave Chinner
@ 2012-08-30 10:57 ` Dave Chinner
  0 siblings, 0 replies; 41+ messages in thread
From: Dave Chinner @ 2012-08-30 10:57 UTC (permalink / raw)
  To: xfs

From: Dave Chinner <dchinner@redhat.com>

xfs_syncd_start and xfs_syncd_stop tie a bunch of unrelated
functionailty together that actually have different start and stop
requirements. Kill these functions and open code the start/stop
methods for each of the background functions.

Subsequent patches will move the start/stop functions around to the
correct places to avoid races and shutdown issues.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_super.c |   25 ++++++++++++++++++-------
 fs/xfs/xfs_sync.c  |   30 ++++--------------------------
 fs/xfs/xfs_sync.h  |    6 ++++--
 3 files changed, 26 insertions(+), 35 deletions(-)

diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index bdaf4cb..116fcb8 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -920,7 +920,11 @@ xfs_fs_put_super(
 
 	xfs_filestream_unmount(mp);
 	xfs_unmountfs(mp);
-	xfs_syncd_stop(mp);
+
+	cancel_delayed_work_sync(&mp->m_sync_work);
+	cancel_delayed_work_sync(&mp->m_reclaim_work);
+	cancel_work_sync(&mp->m_flush_work);
+
 	xfs_freesb(mp);
 	xfs_icsb_destroy_counters(mp);
 	xfs_destroy_mount_workqueues(mp);
@@ -1290,9 +1294,11 @@ xfs_fs_fill_super(
 	sb->s_time_gran = 1;
 	set_posix_acl_flag(sb);
 
-	error = xfs_syncd_init(mp);
-	if (error)
-		goto out_filestream_unmount;
+	INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
+	INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker);
+	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
+
+	xfs_syncd_queue_sync(mp);
 
 	error = xfs_mountfs(mp);
 	if (error)
@@ -1315,8 +1321,10 @@ xfs_fs_fill_super(
 
 	return 0;
  out_syncd_stop:
-	xfs_syncd_stop(mp);
- out_filestream_unmount:
+	cancel_delayed_work_sync(&mp->m_sync_work);
+	cancel_delayed_work_sync(&mp->m_reclaim_work);
+	cancel_work_sync(&mp->m_flush_work);
+
 	xfs_filestream_unmount(mp);
  out_free_sb:
 	xfs_freesb(mp);
@@ -1335,7 +1343,10 @@ out_destroy_workqueues:
  out_unmount:
 	xfs_filestream_unmount(mp);
 	xfs_unmountfs(mp);
-	xfs_syncd_stop(mp);
+
+	cancel_delayed_work_sync(&mp->m_sync_work);
+	cancel_delayed_work_sync(&mp->m_reclaim_work);
+	cancel_work_sync(&mp->m_flush_work);
 	goto out_free_sb;
 }
 
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index 9654817..13830e4 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -370,7 +370,7 @@ xfs_quiesce_attr(
 	xfs_buf_unlock(mp->m_sb_bp);
 }
 
-static void
+void
 xfs_syncd_queue_sync(
 	struct xfs_mount        *mp)
 {
@@ -383,7 +383,7 @@ xfs_syncd_queue_sync(
  * disk quotas.  We might need to cover the log to indicate that the
  * filesystem is idle and not frozen.
  */
-STATIC void
+void
 xfs_sync_worker(
 	struct work_struct *work)
 {
@@ -445,7 +445,7 @@ xfs_syncd_queue_reclaim(
  * goes low. It scans as quickly as possible avoiding locked inodes or those
  * already being flushed, and once done schedules a future pass.
  */
-STATIC void
+void
 xfs_reclaim_worker(
 	struct work_struct *work)
 {
@@ -478,7 +478,7 @@ xfs_flush_inodes(
 	flush_work_sync(&mp->m_flush_work);
 }
 
-STATIC void
+void
 xfs_flush_worker(
 	struct work_struct *work)
 {
@@ -489,28 +489,6 @@ xfs_flush_worker(
 	xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
 }
 
-int
-xfs_syncd_init(
-	struct xfs_mount	*mp)
-{
-	INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
-	INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker);
-	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
-
-	xfs_syncd_queue_sync(mp);
-
-	return 0;
-}
-
-void
-xfs_syncd_stop(
-	struct xfs_mount	*mp)
-{
-	cancel_delayed_work_sync(&mp->m_sync_work);
-	cancel_delayed_work_sync(&mp->m_reclaim_work);
-	cancel_work_sync(&mp->m_flush_work);
-}
-
 void
 __xfs_inode_set_reclaim_tag(
 	struct xfs_perag	*pag,
diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h
index 941202e..3f59e5b 100644
--- a/fs/xfs/xfs_sync.h
+++ b/fs/xfs/xfs_sync.h
@@ -26,8 +26,10 @@ struct xfs_perag;
 
 extern struct workqueue_struct	*xfs_syncd_wq;	/* sync workqueue */
 
-int xfs_syncd_init(struct xfs_mount *mp);
-void xfs_syncd_stop(struct xfs_mount *mp);
+void xfs_syncd_queue_sync(struct xfs_mount *mp);
+void xfs_sync_worker(struct work_struct *work);
+void xfs_flush_worker(struct work_struct *work);
+void xfs_reclaim_worker(struct work_struct *work);
 
 int xfs_quiesce_data(struct xfs_mount *mp);
 void xfs_quiesce_attr(struct xfs_mount *mp);
-- 
1.7.10

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 41+ messages in thread

* [PATCH 01/13] xfs: xfs_syncd_stop must die
  2012-08-30 12:00 [PATCH V2 00/13] xfs: remove the xfssyncd mess Dave Chinner
@ 2012-08-30 12:00 ` Dave Chinner
  2012-09-01 23:15   ` Christoph Hellwig
  2012-09-04 16:10   ` Mark Tinguely
  0 siblings, 2 replies; 41+ messages in thread
From: Dave Chinner @ 2012-08-30 12:00 UTC (permalink / raw)
  To: xfs

From: Dave Chinner <dchinner@redhat.com>

xfs_syncd_start and xfs_syncd_stop tie a bunch of unrelated
functionailty together that actually have different start and stop
requirements. Kill these functions and open code the start/stop
methods for each of the background functions.

Subsequent patches will move the start/stop functions around to the
correct places to avoid races and shutdown issues.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_super.c |   25 ++++++++++++++++++-------
 fs/xfs/xfs_sync.c  |   30 ++++--------------------------
 fs/xfs/xfs_sync.h  |    6 ++++--
 3 files changed, 26 insertions(+), 35 deletions(-)

diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index bdaf4cb..116fcb8 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -920,7 +920,11 @@ xfs_fs_put_super(
 
 	xfs_filestream_unmount(mp);
 	xfs_unmountfs(mp);
-	xfs_syncd_stop(mp);
+
+	cancel_delayed_work_sync(&mp->m_sync_work);
+	cancel_delayed_work_sync(&mp->m_reclaim_work);
+	cancel_work_sync(&mp->m_flush_work);
+
 	xfs_freesb(mp);
 	xfs_icsb_destroy_counters(mp);
 	xfs_destroy_mount_workqueues(mp);
@@ -1290,9 +1294,11 @@ xfs_fs_fill_super(
 	sb->s_time_gran = 1;
 	set_posix_acl_flag(sb);
 
-	error = xfs_syncd_init(mp);
-	if (error)
-		goto out_filestream_unmount;
+	INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
+	INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker);
+	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
+
+	xfs_syncd_queue_sync(mp);
 
 	error = xfs_mountfs(mp);
 	if (error)
@@ -1315,8 +1321,10 @@ xfs_fs_fill_super(
 
 	return 0;
  out_syncd_stop:
-	xfs_syncd_stop(mp);
- out_filestream_unmount:
+	cancel_delayed_work_sync(&mp->m_sync_work);
+	cancel_delayed_work_sync(&mp->m_reclaim_work);
+	cancel_work_sync(&mp->m_flush_work);
+
 	xfs_filestream_unmount(mp);
  out_free_sb:
 	xfs_freesb(mp);
@@ -1335,7 +1343,10 @@ out_destroy_workqueues:
  out_unmount:
 	xfs_filestream_unmount(mp);
 	xfs_unmountfs(mp);
-	xfs_syncd_stop(mp);
+
+	cancel_delayed_work_sync(&mp->m_sync_work);
+	cancel_delayed_work_sync(&mp->m_reclaim_work);
+	cancel_work_sync(&mp->m_flush_work);
 	goto out_free_sb;
 }
 
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index 9654817..13830e4 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -370,7 +370,7 @@ xfs_quiesce_attr(
 	xfs_buf_unlock(mp->m_sb_bp);
 }
 
-static void
+void
 xfs_syncd_queue_sync(
 	struct xfs_mount        *mp)
 {
@@ -383,7 +383,7 @@ xfs_syncd_queue_sync(
  * disk quotas.  We might need to cover the log to indicate that the
  * filesystem is idle and not frozen.
  */
-STATIC void
+void
 xfs_sync_worker(
 	struct work_struct *work)
 {
@@ -445,7 +445,7 @@ xfs_syncd_queue_reclaim(
  * goes low. It scans as quickly as possible avoiding locked inodes or those
  * already being flushed, and once done schedules a future pass.
  */
-STATIC void
+void
 xfs_reclaim_worker(
 	struct work_struct *work)
 {
@@ -478,7 +478,7 @@ xfs_flush_inodes(
 	flush_work_sync(&mp->m_flush_work);
 }
 
-STATIC void
+void
 xfs_flush_worker(
 	struct work_struct *work)
 {
@@ -489,28 +489,6 @@ xfs_flush_worker(
 	xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
 }
 
-int
-xfs_syncd_init(
-	struct xfs_mount	*mp)
-{
-	INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
-	INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker);
-	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
-
-	xfs_syncd_queue_sync(mp);
-
-	return 0;
-}
-
-void
-xfs_syncd_stop(
-	struct xfs_mount	*mp)
-{
-	cancel_delayed_work_sync(&mp->m_sync_work);
-	cancel_delayed_work_sync(&mp->m_reclaim_work);
-	cancel_work_sync(&mp->m_flush_work);
-}
-
 void
 __xfs_inode_set_reclaim_tag(
 	struct xfs_perag	*pag,
diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h
index 941202e..3f59e5b 100644
--- a/fs/xfs/xfs_sync.h
+++ b/fs/xfs/xfs_sync.h
@@ -26,8 +26,10 @@ struct xfs_perag;
 
 extern struct workqueue_struct	*xfs_syncd_wq;	/* sync workqueue */
 
-int xfs_syncd_init(struct xfs_mount *mp);
-void xfs_syncd_stop(struct xfs_mount *mp);
+void xfs_syncd_queue_sync(struct xfs_mount *mp);
+void xfs_sync_worker(struct work_struct *work);
+void xfs_flush_worker(struct work_struct *work);
+void xfs_reclaim_worker(struct work_struct *work);
 
 int xfs_quiesce_data(struct xfs_mount *mp);
 void xfs_quiesce_attr(struct xfs_mount *mp);
-- 
1.7.10

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 41+ messages in thread

* Re: [PATCH 01/13] xfs: xfs_syncd_stop must die
  2012-08-30 12:00 ` [PATCH 01/13] xfs: xfs_syncd_stop must die Dave Chinner
@ 2012-09-01 23:15   ` Christoph Hellwig
  2012-09-04 16:10   ` Mark Tinguely
  1 sibling, 0 replies; 41+ messages in thread
From: Christoph Hellwig @ 2012-09-01 23:15 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

On Thu, Aug 30, 2012 at 10:00:05PM +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> xfs_syncd_start and xfs_syncd_stop tie a bunch of unrelated
> functionailty together that actually have different start and stop
> requirements. Kill these functions and open code the start/stop
> methods for each of the background functions.
> 
> Subsequent patches will move the start/stop functions around to the
> correct places to avoid races and shutdown issues.
> 
> Signed-off-by: Dave Chinner <dchinner@redhat.com>

Looks good,

Reviewed-by: Christoph Hellwig <hch@lst.de>

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 01/13] xfs: xfs_syncd_stop must die
  2012-08-30 12:00 ` [PATCH 01/13] xfs: xfs_syncd_stop must die Dave Chinner
  2012-09-01 23:15   ` Christoph Hellwig
@ 2012-09-04 16:10   ` Mark Tinguely
  1 sibling, 0 replies; 41+ messages in thread
From: Mark Tinguely @ 2012-09-04 16:10 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

On 08/30/12 07:00, Dave Chinner wrote:
> From: Dave Chinner<dchinner@redhat.com>
>
> xfs_syncd_start and xfs_syncd_stop tie a bunch of unrelated
> functionailty together that actually have different start and stop
> requirements. Kill these functions and open code the start/stop
> methods for each of the background functions.
>
> Subsequent patches will move the start/stop functions around to the
> correct places to avoid races and shutdown issues.
>
> Signed-off-by: Dave Chinner<dchinner@redhat.com>

Looks good.

Reviewed-by: Mark Tinguely <tinguely@sgi.com>

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 41+ messages in thread

* [PATCH V3 00/13] xfs: remove the xfssyncd mess
@ 2012-09-28  4:44 Dave Chinner
  2012-09-28  4:44 ` [PATCH 01/13] xfs: xfs_syncd_stop must die Dave Chinner
                   ` (12 more replies)
  0 siblings, 13 replies; 41+ messages in thread
From: Dave Chinner @ 2012-09-28  4:44 UTC (permalink / raw)
  To: xfs

Version 3 of the patchset I described here:

http://oss.sgi.com/archives/xfs/2012-06/msg00064.html

This patchset reliably exposed the log->l_last_sync_lsn problem I
just posted a fix for.

Version 3:
- per-mount log and reclaim workqueues instead of a generic mount
  workqueue.
- reordering of some of the patches as Christoph requested.
- fixed the writeback_inodes_sb_if_idle deadlock by moving it all
  the way back up the write stack to xfs_file_aio_buffered_write
  where we were just flushing the current file to avoid deadlocking
  on it anyway.
- reintroduced xfs_flush_inodes() as a wrapper for
  writeback_inodes_sb_if_idle().
- rebased on a current TOT.

Version 2:
- fix writeback_inodes_sb_if_idle call in xfs_create()
- refreshed patch 13 before sending.

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 41+ messages in thread

* [PATCH 01/13] xfs: xfs_syncd_stop must die
  2012-09-28  4:44 [PATCH V3 00/13] xfs: remove the xfssyncd mess Dave Chinner
@ 2012-09-28  4:44 ` Dave Chinner
  2012-09-28  4:44 ` [PATCH 02/13] xfs: rationalise xfs_mount_wq users Dave Chinner
                   ` (11 subsequent siblings)
  12 siblings, 0 replies; 41+ messages in thread
From: Dave Chinner @ 2012-09-28  4:44 UTC (permalink / raw)
  To: xfs

From: Dave Chinner <dchinner@redhat.com>

xfs_syncd_start and xfs_syncd_stop tie a bunch of unrelated
functionailty together that actually have different start and stop
requirements. Kill these functions and open code the start/stop
methods for each of the background functions.

Subsequent patches will move the start/stop functions around to the
correct places to avoid races and shutdown issues.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
---
 fs/xfs/xfs_super.c |   25 ++++++++++++++++++-------
 fs/xfs/xfs_sync.c  |   30 ++++--------------------------
 fs/xfs/xfs_sync.h  |    6 ++++--
 3 files changed, 26 insertions(+), 35 deletions(-)

diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 19e2380..59d76bd 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -921,7 +921,11 @@ xfs_fs_put_super(
 	xfs_filestream_unmount(mp);
 	cancel_delayed_work_sync(&mp->m_sync_work);
 	xfs_unmountfs(mp);
-	xfs_syncd_stop(mp);
+
+	cancel_delayed_work_sync(&mp->m_sync_work);
+	cancel_delayed_work_sync(&mp->m_reclaim_work);
+	cancel_work_sync(&mp->m_flush_work);
+
 	xfs_freesb(mp);
 	xfs_icsb_destroy_counters(mp);
 	xfs_destroy_mount_workqueues(mp);
@@ -1291,9 +1295,11 @@ xfs_fs_fill_super(
 	sb->s_time_gran = 1;
 	set_posix_acl_flag(sb);
 
-	error = xfs_syncd_init(mp);
-	if (error)
-		goto out_filestream_unmount;
+	INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
+	INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker);
+	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
+
+	xfs_syncd_queue_sync(mp);
 
 	error = xfs_mountfs(mp);
 	if (error)
@@ -1316,8 +1322,10 @@ xfs_fs_fill_super(
 
 	return 0;
  out_syncd_stop:
-	xfs_syncd_stop(mp);
- out_filestream_unmount:
+	cancel_delayed_work_sync(&mp->m_sync_work);
+	cancel_delayed_work_sync(&mp->m_reclaim_work);
+	cancel_work_sync(&mp->m_flush_work);
+
 	xfs_filestream_unmount(mp);
  out_free_sb:
 	xfs_freesb(mp);
@@ -1336,7 +1344,10 @@ out_destroy_workqueues:
  out_unmount:
 	xfs_filestream_unmount(mp);
 	xfs_unmountfs(mp);
-	xfs_syncd_stop(mp);
+
+	cancel_delayed_work_sync(&mp->m_sync_work);
+	cancel_delayed_work_sync(&mp->m_reclaim_work);
+	cancel_work_sync(&mp->m_flush_work);
 	goto out_free_sb;
 }
 
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index 9654817..13830e4 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -370,7 +370,7 @@ xfs_quiesce_attr(
 	xfs_buf_unlock(mp->m_sb_bp);
 }
 
-static void
+void
 xfs_syncd_queue_sync(
 	struct xfs_mount        *mp)
 {
@@ -383,7 +383,7 @@ xfs_syncd_queue_sync(
  * disk quotas.  We might need to cover the log to indicate that the
  * filesystem is idle and not frozen.
  */
-STATIC void
+void
 xfs_sync_worker(
 	struct work_struct *work)
 {
@@ -445,7 +445,7 @@ xfs_syncd_queue_reclaim(
  * goes low. It scans as quickly as possible avoiding locked inodes or those
  * already being flushed, and once done schedules a future pass.
  */
-STATIC void
+void
 xfs_reclaim_worker(
 	struct work_struct *work)
 {
@@ -478,7 +478,7 @@ xfs_flush_inodes(
 	flush_work_sync(&mp->m_flush_work);
 }
 
-STATIC void
+void
 xfs_flush_worker(
 	struct work_struct *work)
 {
@@ -489,28 +489,6 @@ xfs_flush_worker(
 	xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
 }
 
-int
-xfs_syncd_init(
-	struct xfs_mount	*mp)
-{
-	INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
-	INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker);
-	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
-
-	xfs_syncd_queue_sync(mp);
-
-	return 0;
-}
-
-void
-xfs_syncd_stop(
-	struct xfs_mount	*mp)
-{
-	cancel_delayed_work_sync(&mp->m_sync_work);
-	cancel_delayed_work_sync(&mp->m_reclaim_work);
-	cancel_work_sync(&mp->m_flush_work);
-}
-
 void
 __xfs_inode_set_reclaim_tag(
 	struct xfs_perag	*pag,
diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h
index 941202e..3f59e5b 100644
--- a/fs/xfs/xfs_sync.h
+++ b/fs/xfs/xfs_sync.h
@@ -26,8 +26,10 @@ struct xfs_perag;
 
 extern struct workqueue_struct	*xfs_syncd_wq;	/* sync workqueue */
 
-int xfs_syncd_init(struct xfs_mount *mp);
-void xfs_syncd_stop(struct xfs_mount *mp);
+void xfs_syncd_queue_sync(struct xfs_mount *mp);
+void xfs_sync_worker(struct work_struct *work);
+void xfs_flush_worker(struct work_struct *work);
+void xfs_reclaim_worker(struct work_struct *work);
 
 int xfs_quiesce_data(struct xfs_mount *mp);
 void xfs_quiesce_attr(struct xfs_mount *mp);
-- 
1.7.10

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 41+ messages in thread

* [PATCH 02/13] xfs: rationalise xfs_mount_wq users
  2012-09-28  4:44 [PATCH V3 00/13] xfs: remove the xfssyncd mess Dave Chinner
  2012-09-28  4:44 ` [PATCH 01/13] xfs: xfs_syncd_stop must die Dave Chinner
@ 2012-09-28  4:44 ` Dave Chinner
  2012-09-28 12:27   ` Christoph Hellwig
  2012-09-28  4:44 ` [PATCH 03/13] xfs: don't run the sync work if the filesystem is read-only Dave Chinner
                   ` (10 subsequent siblings)
  12 siblings, 1 reply; 41+ messages in thread
From: Dave Chinner @ 2012-09-28  4:44 UTC (permalink / raw)
  To: xfs

From: Dave Chinner <dchinner@redhat.com>

Instead of starting and stopping background work on the xfs_mount_wq
all at the same time, separate them to where they really are needed
to start and stop.

The xfs_sync_worker, only needs to be started after all the mount
processing has completed successfully, while it needs to be stopped
before the log is unmounted.

The xfs_reclaim_worker is started on demand, and can be
stopped before the unmount process does it's own inode reclaim pass.

The xfs_flush_inodes work is run on demand, and so we really only
need to ensure that it has stopped running before we start
processing an unmount, freeze or remount,ro.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
---
 fs/xfs/xfs_mount.c |    6 ++++--
 fs/xfs/xfs_super.c |   34 ++++++++++++++--------------------
 fs/xfs/xfs_sync.c  |   21 +++++----------------
 3 files changed, 23 insertions(+), 38 deletions(-)

diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 29c2f83..3152924 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1483,9 +1483,11 @@ xfs_unmountfs(
 
 	/*
 	 * And reclaim all inodes.  At this point there should be no dirty
-	 * inode, and none should be pinned or locked, but use synchronous
-	 * reclaim just to be sure.
+	 * inodes and none should be pinned or locked, but use synchronous
+	 * reclaim just to be sure. We can stop background inode reclaim
+	 * here as well if it is still running.
 	 */
+	cancel_delayed_work_sync(&mp->m_reclaim_work);
 	xfs_reclaim_inodes(mp, SYNC_WAIT);
 
 	xfs_qm_unmount(mp);
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 59d76bd..935b2c7 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -918,14 +918,12 @@ xfs_fs_put_super(
 {
 	struct xfs_mount	*mp = XFS_M(sb);
 
-	xfs_filestream_unmount(mp);
-	cancel_delayed_work_sync(&mp->m_sync_work);
-	xfs_unmountfs(mp);
-
 	cancel_delayed_work_sync(&mp->m_sync_work);
-	cancel_delayed_work_sync(&mp->m_reclaim_work);
 	cancel_work_sync(&mp->m_flush_work);
 
+	xfs_filestream_unmount(mp);
+	xfs_unmountfs(mp);
+
 	xfs_freesb(mp);
 	xfs_icsb_destroy_counters(mp);
 	xfs_destroy_mount_workqueues(mp);
@@ -1232,6 +1230,9 @@ xfs_fs_fill_super(
 	spin_lock_init(&mp->m_sb_lock);
 	mutex_init(&mp->m_growlock);
 	atomic_set(&mp->m_active_trans, 0);
+	INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
+	INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker);
+	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
 
 	mp->m_super = sb;
 	sb->s_fs_info = mp;
@@ -1295,15 +1296,9 @@ xfs_fs_fill_super(
 	sb->s_time_gran = 1;
 	set_posix_acl_flag(sb);
 
-	INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
-	INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker);
-	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
-
-	xfs_syncd_queue_sync(mp);
-
 	error = xfs_mountfs(mp);
 	if (error)
-		goto out_syncd_stop;
+		goto out_filestream_unmount;
 
 	root = igrab(VFS_I(mp->m_rootip));
 	if (!root) {
@@ -1320,12 +1315,15 @@ xfs_fs_fill_super(
 		goto out_unmount;
 	}
 
+	/*
+	 * The filesystem is successfully mounted, so we can start background
+	 * sync work now.
+	 */
+	xfs_syncd_queue_sync(mp);
+
 	return 0;
- out_syncd_stop:
-	cancel_delayed_work_sync(&mp->m_sync_work);
-	cancel_delayed_work_sync(&mp->m_reclaim_work);
-	cancel_work_sync(&mp->m_flush_work);
 
+ out_filestream_unmount:
 	xfs_filestream_unmount(mp);
  out_free_sb:
 	xfs_freesb(mp);
@@ -1344,10 +1342,6 @@ out_destroy_workqueues:
  out_unmount:
 	xfs_filestream_unmount(mp);
 	xfs_unmountfs(mp);
-
-	cancel_delayed_work_sync(&mp->m_sync_work);
-	cancel_delayed_work_sync(&mp->m_reclaim_work);
-	cancel_work_sync(&mp->m_flush_work);
 	goto out_free_sb;
 }
 
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index 13830e4..509190f 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -379,9 +379,9 @@ xfs_syncd_queue_sync(
 }
 
 /*
- * Every sync period we need to unpin all items, reclaim inodes and sync
- * disk quotas.  We might need to cover the log to indicate that the
- * filesystem is idle and not frozen.
+ * Every sync period we need to unpin all items in the AIL and push them to
+ * disk. If there is nothing dirty, then we might need to cover the log to
+ * indicate that the filesystem is idle and not frozen.
  */
 void
 xfs_sync_worker(
@@ -391,17 +391,7 @@ xfs_sync_worker(
 					struct xfs_mount, m_sync_work);
 	int		error;
 
-	/*
-	 * We shouldn't write/force the log if we are in the mount/unmount
-	 * process or on a read only filesystem. The workqueue still needs to be
-	 * active in both cases, however, because it is used for inode reclaim
-	 * during these times.  Use the MS_ACTIVE flag to avoid doing anything
-	 * during mount.  Doing work during unmount is avoided by calling
-	 * cancel_delayed_work_sync on this work queue before tearing down
-	 * the ail and the log in xfs_log_unmount.
-	 */
-	if (!(mp->m_super->s_flags & MS_ACTIVE) &&
-	    !(mp->m_flags & XFS_MOUNT_RDONLY)) {
+	if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
 		/* dgc: errors ignored here */
 		if (mp->m_super->s_writers.frozen == SB_UNFROZEN &&
 		    xfs_log_need_covered(mp))
@@ -409,8 +399,7 @@ xfs_sync_worker(
 		else
 			xfs_log_force(mp, 0);
 
-		/* start pushing all the metadata that is currently
-		 * dirty */
+		/* start pushing all the metadata that is currently dirty */
 		xfs_ail_push_all(mp->m_ail);
 	}
 
-- 
1.7.10

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 41+ messages in thread

* [PATCH 03/13] xfs: don't run the sync work if the filesystem is read-only
  2012-09-28  4:44 [PATCH V3 00/13] xfs: remove the xfssyncd mess Dave Chinner
  2012-09-28  4:44 ` [PATCH 01/13] xfs: xfs_syncd_stop must die Dave Chinner
  2012-09-28  4:44 ` [PATCH 02/13] xfs: rationalise xfs_mount_wq users Dave Chinner
@ 2012-09-28  4:44 ` Dave Chinner
  2012-09-28 12:27   ` Christoph Hellwig
  2012-09-28  4:44 ` [PATCH 04/13] xfs: sync work is now only periodic log work Dave Chinner
                   ` (9 subsequent siblings)
  12 siblings, 1 reply; 41+ messages in thread
From: Dave Chinner @ 2012-09-28  4:44 UTC (permalink / raw)
  To: xfs

From: Dave Chinner <dchinner@redhat.com>

If the filesystem is mounted or remounted read-only, stop the sync
worker that tries to flush or cover the log if the filesystem is
dirty. It's read-only, so it isn't dirty. Restart it on a remount,rw
as necessary. This avoids the need for RO checks in the work.

Similarly, stop the sync work when the filesystem is frozen, and
start it again when the filesysetm is thawed. This avoids the need
for special freeze checks in the work.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
---
 fs/xfs/xfs_super.c |    2 ++
 fs/xfs/xfs_sync.c  |   29 ++++++++++++++++-------------
 2 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 935b2c7..f6147ae 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1107,6 +1107,7 @@ xfs_fs_remount(
 		 * value if it is non-zero, otherwise go with the default.
 		 */
 		xfs_restore_resvblks(mp);
+		xfs_syncd_queue_sync(mp);
 	}
 
 	/* rw -> ro */
@@ -1152,6 +1153,7 @@ xfs_fs_unfreeze(
 	struct xfs_mount	*mp = XFS_M(sb);
 
 	xfs_restore_resvblks(mp);
+	xfs_syncd_queue_sync(mp);
 	return 0;
 }
 
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index 509190f..5d69f23 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -323,6 +323,9 @@ xfs_quiesce_data(
  * Second stage of a quiesce. The data is already synced, now we have to take
  * care of the metadata. New transactions are already blocked, so we need to
  * wait for any remaining transactions to drain out before proceeding.
+ *
+ * Note: this stops background sync work - the callers must ensure it is started
+ * again when appropriate.
  */
 void
 xfs_quiesce_attr(
@@ -341,6 +344,9 @@ xfs_quiesce_attr(
 	/* flush all pending changes from the AIL */
 	xfs_ail_push_all_sync(mp->m_ail);
 
+	/* stop background sync work */
+	cancel_delayed_work_sync(&mp->m_sync_work);
+
 	/*
 	 * Just warn here till VFS can correctly support
 	 * read-only remount without racing.
@@ -379,9 +385,8 @@ xfs_syncd_queue_sync(
 }
 
 /*
- * Every sync period we need to unpin all items in the AIL and push them to
- * disk. If there is nothing dirty, then we might need to cover the log to
- * indicate that the filesystem is idle and not frozen.
+ * Every sync period we need to push dirty metadata and try to cover the log
+ * to indicate the filesystem is idle and not frozen.
  */
 void
 xfs_sync_worker(
@@ -391,17 +396,15 @@ xfs_sync_worker(
 					struct xfs_mount, m_sync_work);
 	int		error;
 
-	if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
-		/* dgc: errors ignored here */
-		if (mp->m_super->s_writers.frozen == SB_UNFROZEN &&
-		    xfs_log_need_covered(mp))
-			error = xfs_fs_log_dummy(mp);
-		else
-			xfs_log_force(mp, 0);
+	/* dgc: errors ignored here */
+	if (mp->m_super->s_writers.frozen == SB_UNFROZEN &&
+	    xfs_log_need_covered(mp))
+		error = xfs_fs_log_dummy(mp);
+	else
+		xfs_log_force(mp, 0);
 
-		/* start pushing all the metadata that is currently dirty */
-		xfs_ail_push_all(mp->m_ail);
-	}
+	/* start pushing all the metadata that is currently dirty */
+	xfs_ail_push_all(mp->m_ail);
 
 	/* queue us up again */
 	xfs_syncd_queue_sync(mp);
-- 
1.7.10

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 41+ messages in thread

* [PATCH 04/13] xfs: sync work is now only periodic log work
  2012-09-28  4:44 [PATCH V3 00/13] xfs: remove the xfssyncd mess Dave Chinner
                   ` (2 preceding siblings ...)
  2012-09-28  4:44 ` [PATCH 03/13] xfs: don't run the sync work if the filesystem is read-only Dave Chinner
@ 2012-09-28  4:44 ` Dave Chinner
  2012-09-28 12:31   ` Christoph Hellwig
  2012-09-28  4:44 ` [PATCH 05/13] xfs: Bring some sanity to log unmounting Dave Chinner
                   ` (8 subsequent siblings)
  12 siblings, 1 reply; 41+ messages in thread
From: Dave Chinner @ 2012-09-28  4:44 UTC (permalink / raw)
  To: xfs

From: Dave Chinner <dchinner@redhat.com>

The only thing the periodic sync work does now is flush the AIL and
idle the log. These are really functions of the log code, so move
the work to xfs_log.c and rename it appropriately.

The only wart that this leaves behind is the xfssyncd_centisecs
sysctl, otherwise the xfssyncd is dead. Clean up any comments that
related to xfssyncd to reflect it's passing.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
---
 fs/xfs/xfs_log.c      |   41 ++++++++++++++++++++++++++++++++++++++++-
 fs/xfs/xfs_log.h      |    3 +++
 fs/xfs/xfs_log_priv.h |    1 +
 fs/xfs/xfs_mount.h    |    1 -
 fs/xfs/xfs_super.c    |   16 ++++------------
 fs/xfs/xfs_sync.c     |   39 +++------------------------------------
 fs/xfs/xfs_sync.h     |    2 --
 7 files changed, 51 insertions(+), 52 deletions(-)

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 7f4f937..a1fe7a7 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -34,6 +34,7 @@
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_trace.h"
+#include "xfs_fsops.h"
 
 kmem_zone_t	*xfs_log_ticket_zone;
 
@@ -698,6 +699,8 @@ xfs_log_mount_finish(xfs_mount_t *mp)
 		ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
 	}
 
+	xfs_log_work_queue(mp);
+
 	return error;
 }
 
@@ -858,7 +861,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
 void
 xfs_log_unmount(xfs_mount_t *mp)
 {
-	cancel_delayed_work_sync(&mp->m_sync_work);
+	cancel_delayed_work_sync(&mp->m_log->l_work);
 	xfs_trans_ail_destroy(mp);
 	xlog_dealloc_log(mp->m_log);
 }
@@ -1161,6 +1164,40 @@ done:
 }	/* xlog_get_iclog_buffer_size */
 
 
+void
+xfs_log_work_queue(
+	struct xfs_mount        *mp)
+{
+	queue_delayed_work(xfs_syncd_wq, &mp->m_log->l_work,
+				msecs_to_jiffies(xfs_syncd_centisecs * 10));
+}
+
+/*
+ * Every sync period we need to unpin all items in the AIL and push them to
+ * disk. If there is nothing dirty, then we might need to cover the log to
+ * indicate that the filesystem is idle.
+ */
+void
+xfs_log_worker(
+	struct work_struct	*work)
+{
+	struct xlog		*log = container_of(to_delayed_work(work),
+						struct xlog, l_work);
+	struct xfs_mount	*mp = log->l_mp;
+
+	/* dgc: errors ignored - not fatal and nowhere to report them */
+	if (xfs_log_need_covered(mp))
+		xfs_fs_log_dummy(mp);
+	else
+		xfs_log_force(mp, 0);
+
+	/* start pushing all the metadata that is currently dirty */
+	xfs_ail_push_all(mp->m_ail);
+
+	/* queue us up again */
+	xfs_log_work_queue(mp);
+}
+
 /*
  * This routine initializes some of the log structure for a given mount point.
  * Its primary purpose is to fill in enough, so recovery can occur.  However,
@@ -1195,6 +1232,7 @@ xlog_alloc_log(
 	log->l_logBBsize   = num_bblks;
 	log->l_covered_state = XLOG_STATE_COVER_IDLE;
 	log->l_flags	   |= XLOG_ACTIVE_RECOVERY;
+	INIT_DELAYED_WORK(&log->l_work, xfs_log_worker);
 
 	log->l_prev_block  = -1;
 	/* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */
@@ -3700,3 +3738,4 @@ xlog_iclogs_empty(
 	} while (iclog != log->l_iclog);
 	return 1;
 }
+
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 748d312..26ed7de 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -181,5 +181,8 @@ int	xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
 				xfs_lsn_t *commit_lsn, int flags);
 bool	xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
 
+void	xfs_log_work_queue(struct xfs_mount *mp);
+void	xfs_log_worker(struct work_struct *work);
+
 #endif
 #endif	/* __XFS_LOG_H__ */
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 18a801d..9a4e0e5 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -495,6 +495,7 @@ struct xlog {
 	struct xfs_buf		*l_xbuf;        /* extra buffer for log
 						 * wrapping */
 	struct xfs_buftarg	*l_targ;        /* buftarg of log */
+	struct delayed_work	l_work;		/* background flush work */
 	uint			l_flags;
 	uint			l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
 	struct list_head	*l_buf_cancel_table;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index deee09e..26e46ae 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -197,7 +197,6 @@ typedef struct xfs_mount {
 	struct mutex		m_icsb_mutex;	/* balancer sync lock */
 #endif
 	struct xfs_mru_cache	*m_filestream;  /* per-mount filestream data */
-	struct delayed_work	m_sync_work;	/* background sync work */
 	struct delayed_work	m_reclaim_work;	/* background inode reclaim */
 	struct work_struct	m_flush_work;	/* background inode flush */
 	__int64_t		m_update_flags;	/* sb flags we need to update
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index f6147ae..b85ca2d 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -918,7 +918,6 @@ xfs_fs_put_super(
 {
 	struct xfs_mount	*mp = XFS_M(sb);
 
-	cancel_delayed_work_sync(&mp->m_sync_work);
 	cancel_work_sync(&mp->m_flush_work);
 
 	xfs_filestream_unmount(mp);
@@ -953,10 +952,10 @@ xfs_fs_sync_fs(
 	if (laptop_mode) {
 		/*
 		 * The disk must be active because we're syncing.
-		 * We schedule xfssyncd now (now that the disk is
+		 * We schedule log work now (now that the disk is
 		 * active) instead of later (when it might not be).
 		 */
-		flush_delayed_work_sync(&mp->m_sync_work);
+		flush_delayed_work_sync(&mp->m_log->l_work);
 	}
 
 	return 0;
@@ -1107,7 +1106,7 @@ xfs_fs_remount(
 		 * value if it is non-zero, otherwise go with the default.
 		 */
 		xfs_restore_resvblks(mp);
-		xfs_syncd_queue_sync(mp);
+		xfs_log_work_queue(mp);
 	}
 
 	/* rw -> ro */
@@ -1153,7 +1152,7 @@ xfs_fs_unfreeze(
 	struct xfs_mount	*mp = XFS_M(sb);
 
 	xfs_restore_resvblks(mp);
-	xfs_syncd_queue_sync(mp);
+	xfs_log_work_queue(mp);
 	return 0;
 }
 
@@ -1233,7 +1232,6 @@ xfs_fs_fill_super(
 	mutex_init(&mp->m_growlock);
 	atomic_set(&mp->m_active_trans, 0);
 	INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
-	INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker);
 	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
 
 	mp->m_super = sb;
@@ -1317,12 +1315,6 @@ xfs_fs_fill_super(
 		goto out_unmount;
 	}
 
-	/*
-	 * The filesystem is successfully mounted, so we can start background
-	 * sync work now.
-	 */
-	xfs_syncd_queue_sync(mp);
-
 	return 0;
 
  out_filestream_unmount:
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index 5d69f23..7527610 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -19,6 +19,7 @@
 #include "xfs_fs.h"
 #include "xfs_types.h"
 #include "xfs_log.h"
+#include "xfs_log_priv.h"
 #include "xfs_inum.h"
 #include "xfs_trans.h"
 #include "xfs_trans_priv.h"
@@ -344,8 +345,8 @@ xfs_quiesce_attr(
 	/* flush all pending changes from the AIL */
 	xfs_ail_push_all_sync(mp->m_ail);
 
-	/* stop background sync work */
-	cancel_delayed_work_sync(&mp->m_sync_work);
+	/* stop background log work */
+	cancel_delayed_work_sync(&mp->m_log->l_work);
 
 	/*
 	 * Just warn here till VFS can correctly support
@@ -376,40 +377,6 @@ xfs_quiesce_attr(
 	xfs_buf_unlock(mp->m_sb_bp);
 }
 
-void
-xfs_syncd_queue_sync(
-	struct xfs_mount        *mp)
-{
-	queue_delayed_work(xfs_syncd_wq, &mp->m_sync_work,
-				msecs_to_jiffies(xfs_syncd_centisecs * 10));
-}
-
-/*
- * Every sync period we need to push dirty metadata and try to cover the log
- * to indicate the filesystem is idle and not frozen.
- */
-void
-xfs_sync_worker(
-	struct work_struct *work)
-{
-	struct xfs_mount *mp = container_of(to_delayed_work(work),
-					struct xfs_mount, m_sync_work);
-	int		error;
-
-	/* dgc: errors ignored here */
-	if (mp->m_super->s_writers.frozen == SB_UNFROZEN &&
-	    xfs_log_need_covered(mp))
-		error = xfs_fs_log_dummy(mp);
-	else
-		xfs_log_force(mp, 0);
-
-	/* start pushing all the metadata that is currently dirty */
-	xfs_ail_push_all(mp->m_ail);
-
-	/* queue us up again */
-	xfs_syncd_queue_sync(mp);
-}
-
 /*
  * Queue a new inode reclaim pass if there are reclaimable inodes and there
  * isn't a reclaim pass already in progress. By default it runs every 5s based
diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h
index 3f59e5b..8d58fab 100644
--- a/fs/xfs/xfs_sync.h
+++ b/fs/xfs/xfs_sync.h
@@ -26,8 +26,6 @@ struct xfs_perag;
 
 extern struct workqueue_struct	*xfs_syncd_wq;	/* sync workqueue */
 
-void xfs_syncd_queue_sync(struct xfs_mount *mp);
-void xfs_sync_worker(struct work_struct *work);
 void xfs_flush_worker(struct work_struct *work);
 void xfs_reclaim_worker(struct work_struct *work);
 
-- 
1.7.10

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 41+ messages in thread

* [PATCH 05/13] xfs: Bring some sanity to log unmounting
  2012-09-28  4:44 [PATCH V3 00/13] xfs: remove the xfssyncd mess Dave Chinner
                   ` (3 preceding siblings ...)
  2012-09-28  4:44 ` [PATCH 04/13] xfs: sync work is now only periodic log work Dave Chinner
@ 2012-09-28  4:44 ` Dave Chinner
  2012-09-28  4:44 ` [PATCH 06/13] xfs: xfs_sync_data is redundant Dave Chinner
                   ` (7 subsequent siblings)
  12 siblings, 0 replies; 41+ messages in thread
From: Dave Chinner @ 2012-09-28  4:44 UTC (permalink / raw)
  To: xfs

From: Dave Chinner <dchinner@redhat.com>

When unmounting the filesystem, there are lots of operations that
need to be done in a specific order, and they are spread across
across a couple of functions. We have to drain the AIL before we
write the unmount record, and we have to shut down the background
log work before we do either of them.

But this is all split haphazardly across xfs_unmountfs() and
xfs_log_unmount(). Move all the AIL flushing and log manipulations
to xfs_log_unmount() so that the responisbilities of each function
is clear and the operations they perform obvious.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
---
 fs/xfs/xfs_log.c   |   29 ++++++++++++++++++++++++++---
 fs/xfs/xfs_mount.c |   24 ------------------------
 2 files changed, 26 insertions(+), 27 deletions(-)

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index a1fe7a7..a400e71 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -853,15 +853,38 @@ xfs_log_unmount_write(xfs_mount_t *mp)
 }	/* xfs_log_unmount_write */
 
 /*
- * Deallocate log structures for unmount/relocation.
+ * Shut down and release the AIL and Log.
  *
- * We need to stop the aild from running before we destroy
- * and deallocate the log as the aild references the log.
+ * During unmount, we need to ensure we flush all the dirty metadata objects
+ * from the AIL so that the log is empty before we write the unmount record to
+ * the log.
+ *
+ * To do this, we first need to shut down the background log work so it is not
+ * trying to cover the log as we clean up. We then need to unpin all objects in
+ * the log so we can then flush them out. Once they have completed their IO and
+ * run the callbacks removing themselves from the AIL, we can write the unmount
+ * record, tear down the AIL and finally free the log.
  */
 void
 xfs_log_unmount(xfs_mount_t *mp)
 {
 	cancel_delayed_work_sync(&mp->m_log->l_work);
+	xfs_log_force(mp, XFS_LOG_SYNC);
+
+	/*
+	 * The superblock buffer is uncached and while xfs_ail_push_all_sync()
+	 * will push it, xfs_wait_buftarg() will not wait for it. Further,
+	 * xfs_buf_iowait() cannot be used because it was pushed with the
+	 * XBF_ASYNC flag set, so we need to use a lock/unlock pair to wait for
+	 * the IO to complete.
+	 */
+	xfs_ail_push_all_sync(mp->m_ail);
+	xfs_wait_buftarg(mp->m_ddev_targp);
+	xfs_buf_lock(mp->m_sb_bp);
+	xfs_buf_unlock(mp->m_sb_bp);
+
+	xfs_log_unmount_write(mp);
+
 	xfs_trans_ail_destroy(mp);
 	xlog_dealloc_log(mp->m_log);
 }
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 3152924..149b601 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1493,13 +1493,6 @@ xfs_unmountfs(
 	xfs_qm_unmount(mp);
 
 	/*
-	 * Flush out the log synchronously so that we know for sure
-	 * that nothing is pinned.  This is important because bflush()
-	 * will skip pinned buffers.
-	 */
-	xfs_log_force(mp, XFS_LOG_SYNC);
-
-	/*
 	 * Unreserve any blocks we have so that when we unmount we don't account
 	 * the reserved free space as used. This is really only necessary for
 	 * lazy superblock counting because it trusts the incore superblock
@@ -1524,23 +1517,6 @@ xfs_unmountfs(
 		xfs_warn(mp, "Unable to update superblock counters. "
 				"Freespace may not be correct on next mount.");
 
-	/*
-	 * At this point we might have modified the superblock again and thus
-	 * added an item to the AIL, thus flush it again.
-	 */
-	xfs_ail_push_all_sync(mp->m_ail);
-	xfs_wait_buftarg(mp->m_ddev_targp);
-
-	/*
-	 * The superblock buffer is uncached and xfsaild_push() will lock and
-	 * set the XBF_ASYNC flag on the buffer. We cannot do xfs_buf_iowait()
-	 * here but a lock on the superblock buffer will block until iodone()
-	 * has completed.
-	 */
-	xfs_buf_lock(mp->m_sb_bp);
-	xfs_buf_unlock(mp->m_sb_bp);
-
-	xfs_log_unmount_write(mp);
 	xfs_log_unmount(mp);
 	xfs_uuid_unmount(mp);
 
-- 
1.7.10

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 41+ messages in thread

* [PATCH 06/13] xfs: xfs_sync_data is redundant.
  2012-09-28  4:44 [PATCH V3 00/13] xfs: remove the xfssyncd mess Dave Chinner
                   ` (4 preceding siblings ...)
  2012-09-28  4:44 ` [PATCH 05/13] xfs: Bring some sanity to log unmounting Dave Chinner
@ 2012-09-28  4:44 ` Dave Chinner
  2012-09-28 12:34   ` Christoph Hellwig
                     ` (2 more replies)
  2012-09-28  4:44 ` [PATCH 07/13] xfs: syncd workqueue is no more Dave Chinner
                   ` (6 subsequent siblings)
  12 siblings, 3 replies; 41+ messages in thread
From: Dave Chinner @ 2012-09-28  4:44 UTC (permalink / raw)
  To: xfs

From: Dave Chinner <dchinner@redhat.com>

We don't do any data writeback from XFS any more - the VFS is
completely responsible for that, including for freeze. We can
replace the remaining caller with the VFS level function that
achieves the same thing, but without conflicting with current
writeback work - writeback_inodes_sb_if_idle().

This means we can remove the flush_work and xfs_flush_inodes() - the
VFS functionality completely replaces the internal flush queue for
doing this writeback work in a separate context to avoid stack
overruns.

This does have one complication - it cannot be called with page
locks held.  Hence move the flushing of delalloc space when ENOSPC
occurs back up into xfs_file_aio_buffered_write when we don't hold
any locks that will stall writeback.

Note that we always need to pass a count of zero to
generic_file_buffered_write() as the previously written byte count.
We only do this by accident before this patch by the virtue of ret
always being zero when there are no errors. Make this explicit
rather than needing to specifically zero ret in the ENOSPC retry
case.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_file.c  |   13 +++++----
 fs/xfs/xfs_inode.h |   10 +++++++
 fs/xfs/xfs_iomap.c |   23 +++++-----------
 fs/xfs/xfs_mount.h |    1 -
 fs/xfs/xfs_super.c |    3 --
 fs/xfs/xfs_sync.c  |   78 ----------------------------------------------------
 fs/xfs/xfs_sync.h  |    3 --
 7 files changed, 24 insertions(+), 107 deletions(-)

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 1eaeb8b..2cadcf7 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -728,16 +728,17 @@ xfs_file_buffered_aio_write(
 write_retry:
 	trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
 	ret = generic_file_buffered_write(iocb, iovp, nr_segs,
-			pos, &iocb->ki_pos, count, ret);
+			pos, &iocb->ki_pos, count, 0);
+
 	/*
-	 * if we just got an ENOSPC, flush the inode now we aren't holding any
-	 * page locks and retry *once*
+	 * If we just got an ENOSPC, try to write back all dirty inodes to
+	 * convert delalloc space to free up some of the excess reserved
+	 * metadata space.
 	 */
 	if (ret == -ENOSPC && !enospc) {
 		enospc = 1;
-		ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
-		if (!ret)
-			goto write_retry;
+		xfs_flush_inodes(ip);
+		goto write_retry;
 	}
 
 	current->backing_dev_info = NULL;
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 94b32f9..a12fe18 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -288,6 +288,16 @@ xfs_new_eof(struct xfs_inode *ip, xfs_fsize_t new_size)
 }
 
 /*
+ * Flush all dirty data to disk. Must not be called while holding an XFS_ILOCK
+ * or a page lock.
+ */
+static inline void
+xfs_flush_inodes(struct xfs_inode *ip)
+{
+	writeback_inodes_sb_if_idle(VFS_I(ip)->i_sb, WB_REASON_FS_FREE_SPACE);
+}
+
+/*
  * i_flags helper functions
  */
 static inline void
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 973dff6..f858b90 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -373,7 +373,7 @@ xfs_iomap_write_delay(
 	xfs_extlen_t	extsz;
 	int		nimaps;
 	xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS];
-	int		prealloc, flushed = 0;
+	int		prealloc;
 	int		error;
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
@@ -434,26 +434,17 @@ retry:
 	}
 
 	/*
-	 * If bmapi returned us nothing, we got either ENOSPC or EDQUOT.  For
-	 * ENOSPC, * flush all other inodes with delalloc blocks to free up
-	 * some of the excess reserved metadata space. For both cases, retry
+	 * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. Retry
 	 * without EOF preallocation.
 	 */
 	if (nimaps == 0) {
 		trace_xfs_delalloc_enospc(ip, offset, count);
-		if (flushed)
-			return XFS_ERROR(error ? error : ENOSPC);
-
-		if (error == ENOSPC) {
-			xfs_iunlock(ip, XFS_ILOCK_EXCL);
-			xfs_flush_inodes(ip);
-			xfs_ilock(ip, XFS_ILOCK_EXCL);
+		if (prealloc) {
+			prealloc = 0;
+			error = 0;
+			goto retry;
 		}
-
-		flushed = 1;
-		error = 0;
-		prealloc = 0;
-		goto retry;
+		return XFS_ERROR(error ? error : ENOSPC);
 	}
 
 	if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip)))
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 26e46ae..a54b5aa 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -198,7 +198,6 @@ typedef struct xfs_mount {
 #endif
 	struct xfs_mru_cache	*m_filestream;  /* per-mount filestream data */
 	struct delayed_work	m_reclaim_work;	/* background inode reclaim */
-	struct work_struct	m_flush_work;	/* background inode flush */
 	__int64_t		m_update_flags;	/* sb flags we need to update
 						   on the next remount,rw */
 	struct shrinker		m_inode_shrink;	/* inode reclaim shrinker */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index b85ca2d..fed1eb2 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -918,8 +918,6 @@ xfs_fs_put_super(
 {
 	struct xfs_mount	*mp = XFS_M(sb);
 
-	cancel_work_sync(&mp->m_flush_work);
-
 	xfs_filestream_unmount(mp);
 	xfs_unmountfs(mp);
 
@@ -1231,7 +1229,6 @@ xfs_fs_fill_super(
 	spin_lock_init(&mp->m_sb_lock);
 	mutex_init(&mp->m_growlock);
 	atomic_set(&mp->m_active_trans, 0);
-	INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
 	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
 
 	mp->m_super = sb;
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index 7527610..6a2ada3 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -217,51 +217,6 @@ xfs_inode_ag_iterator(
 }
 
 STATIC int
-xfs_sync_inode_data(
-	struct xfs_inode	*ip,
-	struct xfs_perag	*pag,
-	int			flags)
-{
-	struct inode		*inode = VFS_I(ip);
-	struct address_space *mapping = inode->i_mapping;
-	int			error = 0;
-
-	if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
-		return 0;
-
-	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) {
-		if (flags & SYNC_TRYLOCK)
-			return 0;
-		xfs_ilock(ip, XFS_IOLOCK_SHARED);
-	}
-
-	error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ?
-				0 : XBF_ASYNC, FI_NONE);
-	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-	return error;
-}
-
-/*
- * Write out pagecache data for the whole filesystem.
- */
-STATIC int
-xfs_sync_data(
-	struct xfs_mount	*mp,
-	int			flags)
-{
-	int			error;
-
-	ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
-
-	error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags);
-	if (error)
-		return XFS_ERROR(error);
-
-	xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
-	return 0;
-}
-
-STATIC int
 xfs_sync_fsdata(
 	struct xfs_mount	*mp)
 {
@@ -415,39 +370,6 @@ xfs_reclaim_worker(
 	xfs_syncd_queue_reclaim(mp);
 }
 
-/*
- * Flush delayed allocate data, attempting to free up reserved space
- * from existing allocations.  At this point a new allocation attempt
- * has failed with ENOSPC and we are in the process of scratching our
- * heads, looking about for more room.
- *
- * Queue a new data flush if there isn't one already in progress and
- * wait for completion of the flush. This means that we only ever have one
- * inode flush in progress no matter how many ENOSPC events are occurring and
- * so will prevent the system from bogging down due to every concurrent
- * ENOSPC event scanning all the active inodes in the system for writeback.
- */
-void
-xfs_flush_inodes(
-	struct xfs_inode	*ip)
-{
-	struct xfs_mount	*mp = ip->i_mount;
-
-	queue_work(xfs_syncd_wq, &mp->m_flush_work);
-	flush_work_sync(&mp->m_flush_work);
-}
-
-void
-xfs_flush_worker(
-	struct work_struct *work)
-{
-	struct xfs_mount *mp = container_of(work,
-					struct xfs_mount, m_flush_work);
-
-	xfs_sync_data(mp, SYNC_TRYLOCK);
-	xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
-}
-
 void
 __xfs_inode_set_reclaim_tag(
 	struct xfs_perag	*pag,
diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h
index 8d58fab..0018e84 100644
--- a/fs/xfs/xfs_sync.h
+++ b/fs/xfs/xfs_sync.h
@@ -26,14 +26,11 @@ struct xfs_perag;
 
 extern struct workqueue_struct	*xfs_syncd_wq;	/* sync workqueue */
 
-void xfs_flush_worker(struct work_struct *work);
 void xfs_reclaim_worker(struct work_struct *work);
 
 int xfs_quiesce_data(struct xfs_mount *mp);
 void xfs_quiesce_attr(struct xfs_mount *mp);
 
-void xfs_flush_inodes(struct xfs_inode *ip);
-
 int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
 int xfs_reclaim_inodes_count(struct xfs_mount *mp);
 void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
-- 
1.7.10

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 41+ messages in thread

* [PATCH 07/13] xfs: syncd workqueue is no more
  2012-09-28  4:44 [PATCH V3 00/13] xfs: remove the xfssyncd mess Dave Chinner
                   ` (5 preceding siblings ...)
  2012-09-28  4:44 ` [PATCH 06/13] xfs: xfs_sync_data is redundant Dave Chinner
@ 2012-09-28  4:44 ` Dave Chinner
  2012-09-28 12:35   ` Christoph Hellwig
  2012-09-28 18:17   ` Mark Tinguely
  2012-09-28  4:44 ` [PATCH 08/13] xfs: xfs_sync_fsdata is redundant Dave Chinner
                   ` (5 subsequent siblings)
  12 siblings, 2 replies; 41+ messages in thread
From: Dave Chinner @ 2012-09-28  4:44 UTC (permalink / raw)
  To: xfs

From: Dave Chinner <dchinner@redhat.com>

With the syncd functions moved to the log and/or removed, the syncd
workqueue is the only remaining bit left. It is used by the log
covering/ail pushing work, as well as by the inode reclaim work.

Given how cheap workqueues are these days, give the log and inode
reclaim work their own work queues and kill the syncd work queue.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_log.c   |    2 +-
 fs/xfs/xfs_mount.h |    2 ++
 fs/xfs/xfs_super.c |   38 ++++++++++++++++++--------------------
 fs/xfs/xfs_sync.c  |   20 +++++++++-----------
 fs/xfs/xfs_sync.h  |    2 --
 5 files changed, 30 insertions(+), 34 deletions(-)

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index a400e71..96ea9ed 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1191,7 +1191,7 @@ void
 xfs_log_work_queue(
 	struct xfs_mount        *mp)
 {
-	queue_delayed_work(xfs_syncd_wq, &mp->m_log->l_work,
+	queue_delayed_work(mp->m_log_workqueue, &mp->m_log->l_work,
 				msecs_to_jiffies(xfs_syncd_centisecs * 10));
 }
 
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index a54b5aa..7c417b6 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -207,6 +207,8 @@ typedef struct xfs_mount {
 	struct workqueue_struct	*m_data_workqueue;
 	struct workqueue_struct	*m_unwritten_workqueue;
 	struct workqueue_struct	*m_cil_workqueue;
+	struct workqueue_struct	*m_reclaim_workqueue;
+	struct workqueue_struct	*m_log_workqueue;
 } xfs_mount_t;
 
 /*
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index fed1eb2..65cf42c 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -776,8 +776,23 @@ xfs_init_mount_workqueues(
 			WQ_MEM_RECLAIM, 0, mp->m_fsname);
 	if (!mp->m_cil_workqueue)
 		goto out_destroy_unwritten;
+
+	mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s",
+			WQ_NON_REENTRANT, 0, mp->m_fsname);
+	if (!mp->m_reclaim_workqueue)
+		goto out_destroy_cil;
+
+	mp->m_log_workqueue = alloc_workqueue("xfs-reclaim/%s",
+			WQ_NON_REENTRANT, 0, mp->m_fsname);
+	if (!mp->m_log_workqueue)
+		goto out_destroy_reclaim;
+
 	return 0;
 
+out_destroy_reclaim:
+	destroy_workqueue(mp->m_reclaim_workqueue);
+out_destroy_cil:
+	destroy_workqueue(mp->m_cil_workqueue);
 out_destroy_unwritten:
 	destroy_workqueue(mp->m_unwritten_workqueue);
 out_destroy_data_iodone_queue:
@@ -790,6 +805,8 @@ STATIC void
 xfs_destroy_mount_workqueues(
 	struct xfs_mount	*mp)
 {
+	destroy_workqueue(mp->m_log_workqueue);
+	destroy_workqueue(mp->m_reclaim_workqueue);
 	destroy_workqueue(mp->m_cil_workqueue);
 	destroy_workqueue(mp->m_data_workqueue);
 	destroy_workqueue(mp->m_unwritten_workqueue);
@@ -1280,10 +1297,6 @@ xfs_fs_fill_super(
 	/*
 	 * we must configure the block size in the superblock before we run the
 	 * full mount process as the mount process can lookup and cache inodes.
-	 * For the same reason we must also initialise the syncd and register
-	 * the inode cache shrinker so that inodes can be reclaimed during
-	 * operations like a quotacheck that iterate all inodes in the
-	 * filesystem.
 	 */
 	sb->s_magic = XFS_SB_MAGIC;
 	sb->s_blocksize = mp->m_sb.sb_blocksize;
@@ -1523,16 +1536,6 @@ STATIC int __init
 xfs_init_workqueues(void)
 {
 	/*
-	 * We never want to the same work item to run twice, reclaiming inodes
-	 * or idling the log is not going to get any faster by multiple CPUs
-	 * competing for ressources.  Use the default large max_active value
-	 * so that even lots of filesystems can perform these task in parallel.
-	 */
-	xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_NON_REENTRANT, 0);
-	if (!xfs_syncd_wq)
-		return -ENOMEM;
-
-	/*
 	 * The allocation workqueue can be used in memory reclaim situations
 	 * (writepage path), and parallelism is only limited by the number of
 	 * AGs in all the filesystems mounted. Hence use the default large
@@ -1540,20 +1543,15 @@ xfs_init_workqueues(void)
 	 */
 	xfs_alloc_wq = alloc_workqueue("xfsalloc", WQ_MEM_RECLAIM, 0);
 	if (!xfs_alloc_wq)
-		goto out_destroy_syncd;
+		return -ENOMEM;
 
 	return 0;
-
-out_destroy_syncd:
-	destroy_workqueue(xfs_syncd_wq);
-	return -ENOMEM;
 }
 
 STATIC void
 xfs_destroy_workqueues(void)
 {
 	destroy_workqueue(xfs_alloc_wq);
-	destroy_workqueue(xfs_syncd_wq);
 }
 
 STATIC int __init
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index 6a2ada3..15be21f 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -40,8 +40,6 @@
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 
-struct workqueue_struct	*xfs_syncd_wq;	/* sync workqueue */
-
 /*
  * The inode lookup is done in batches to keep the amount of lock traffic and
  * radix tree lookups to a minimum. The batch size is a trade off between
@@ -335,18 +333,18 @@ xfs_quiesce_attr(
 /*
  * Queue a new inode reclaim pass if there are reclaimable inodes and there
  * isn't a reclaim pass already in progress. By default it runs every 5s based
- * on the xfs syncd work default of 30s. Perhaps this should have it's own
+ * on the xfs periodic sync default of 30s. Perhaps this should have it's own
  * tunable, but that can be done if this method proves to be ineffective or too
  * aggressive.
  */
 static void
-xfs_syncd_queue_reclaim(
+xfs_reclaim_work_queue(
 	struct xfs_mount        *mp)
 {
 
 	rcu_read_lock();
 	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
-		queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work,
+		queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
 			msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
 	}
 	rcu_read_unlock();
@@ -367,7 +365,7 @@ xfs_reclaim_worker(
 					struct xfs_mount, m_reclaim_work);
 
 	xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
-	xfs_syncd_queue_reclaim(mp);
+	xfs_reclaim_work_queue(mp);
 }
 
 void
@@ -388,7 +386,7 @@ __xfs_inode_set_reclaim_tag(
 		spin_unlock(&ip->i_mount->m_perag_lock);
 
 		/* schedule periodic background inode reclaim */
-		xfs_syncd_queue_reclaim(ip->i_mount);
+		xfs_reclaim_work_queue(ip->i_mount);
 
 		trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
 							-1, _RET_IP_);
@@ -646,9 +644,9 @@ out:
 	/*
 	 * We could return EAGAIN here to make reclaim rescan the inode tree in
 	 * a short while. However, this just burns CPU time scanning the tree
-	 * waiting for IO to complete and xfssyncd never goes back to the idle
-	 * state. Instead, return 0 to let the next scheduled background reclaim
-	 * attempt to reclaim the inode again.
+	 * waiting for IO to complete and the reclaim work never goes back to
+	 * the idle state. Instead, return 0 to let the next scheduled
+	 * background reclaim attempt to reclaim the inode again.
 	 */
 	return 0;
 }
@@ -804,7 +802,7 @@ xfs_reclaim_inodes_nr(
 	int			nr_to_scan)
 {
 	/* kick background reclaimer and push the AIL */
-	xfs_syncd_queue_reclaim(mp);
+	xfs_reclaim_work_queue(mp);
 	xfs_ail_push_all(mp->m_ail);
 
 	xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan);
diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h
index 0018e84..0beabea 100644
--- a/fs/xfs/xfs_sync.h
+++ b/fs/xfs/xfs_sync.h
@@ -24,8 +24,6 @@ struct xfs_perag;
 #define SYNC_WAIT		0x0001	/* wait for i/o to complete */
 #define SYNC_TRYLOCK		0x0002  /* only try to lock inodes */
 
-extern struct workqueue_struct	*xfs_syncd_wq;	/* sync workqueue */
-
 void xfs_reclaim_worker(struct work_struct *work);
 
 int xfs_quiesce_data(struct xfs_mount *mp);
-- 
1.7.10

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 41+ messages in thread

* [PATCH 08/13] xfs: xfs_sync_fsdata is redundant
  2012-09-28  4:44 [PATCH V3 00/13] xfs: remove the xfssyncd mess Dave Chinner
                   ` (6 preceding siblings ...)
  2012-09-28  4:44 ` [PATCH 07/13] xfs: syncd workqueue is no more Dave Chinner
@ 2012-09-28  4:44 ` Dave Chinner
  2012-09-28  4:44 ` [PATCH 09/13] xfs: move xfs_quiesce_attr() into xfs_super.c Dave Chinner
                   ` (4 subsequent siblings)
  12 siblings, 0 replies; 41+ messages in thread
From: Dave Chinner @ 2012-09-28  4:44 UTC (permalink / raw)
  To: xfs

From: Dave Chinner <dchinner@redhat.com>

Why do we need to write the superblock to disk once we've written
all the data?  We don't actually - the reasons for doing this are
lost in the mists of time, and go back to the way Irix used to drive
VFS flushing.

On linux, this code is only called from two contexts: remount and
.sync_fs. In the remount case, the call is followed by a metadata
sync, which unpins and writes the superblock.  In the sync_fs case,
we only need to force the log to disk to ensure that the superblock
is correctly on disk, so we don't actually need to write it. Hence
the functionality is either redundant or superfluous and thus can be
removed.

Seeing as xfs_quiesce_data is essentially now just a log force,
remove it as well and fold the code back into the two callers.
Neither of them need the log covering check, either, as that is
redundant for the remount case, and unnecessary for the .sync_fs
case.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
---
 fs/xfs/xfs_super.c |   19 +++++----------
 fs/xfs/xfs_sync.c  |   67 +++++++---------------------------------------------
 2 files changed, 14 insertions(+), 72 deletions(-)

diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 65cf42c..1ee7b09 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -952,7 +952,6 @@ xfs_fs_sync_fs(
 	int			wait)
 {
 	struct xfs_mount	*mp = XFS_M(sb);
-	int			error;
 
 	/*
 	 * Doing anything during the async pass would be counterproductive.
@@ -960,10 +959,7 @@ xfs_fs_sync_fs(
 	if (!wait)
 		return 0;
 
-	error = xfs_quiesce_data(mp);
-	if (error)
-		return -error;
-
+	xfs_log_force(mp, XFS_LOG_SYNC);
 	if (laptop_mode) {
 		/*
 		 * The disk must be active because we're syncing.
@@ -1127,15 +1123,12 @@ xfs_fs_remount(
 	/* rw -> ro */
 	if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) {
 		/*
-		 * After we have synced the data but before we sync the
-		 * metadata, we need to free up the reserve block pool so that
-		 * the used block count in the superblock on disk is correct at
-		 * the end of the remount. Stash the current reserve pool size
-		 * so that if we get remounted rw, we can return it to the same
-		 * size.
+		 * Before we sync the metadata, we need to free up the reserve
+		 * block pool so that the used block count in the superblock on
+		 * disk is correct at the end of the remount. Stash the current
+		 * reserve pool size so that if we get remounted rw, we can
+		 * return it to the same size.
 		 */
-
-		xfs_quiesce_data(mp);
 		xfs_save_resvblks(mp);
 		xfs_quiesce_attr(mp);
 		mp->m_flags |= XFS_MOUNT_RDONLY;
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index 15be21f..581eb59 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -214,70 +214,16 @@ xfs_inode_ag_iterator(
 	return XFS_ERROR(last_error);
 }
 
-STATIC int
-xfs_sync_fsdata(
-	struct xfs_mount	*mp)
-{
-	struct xfs_buf		*bp;
-	int			error;
-
-	/*
-	 * If the buffer is pinned then push on the log so we won't get stuck
-	 * waiting in the write for someone, maybe ourselves, to flush the log.
-	 *
-	 * Even though we just pushed the log above, we did not have the
-	 * superblock buffer locked at that point so it can become pinned in
-	 * between there and here.
-	 */
-	bp = xfs_getsb(mp, 0);
-	if (xfs_buf_ispinned(bp))
-		xfs_log_force(mp, 0);
-	error = xfs_bwrite(bp);
-	xfs_buf_relse(bp);
-	return error;
-}
-
-/*
- * When remounting a filesystem read-only or freezing the filesystem, we have
- * two phases to execute. This first phase is syncing the data before we
- * quiesce the filesystem, and the second is flushing all the inodes out after
- * we've waited for all the transactions created by the first phase to
- * complete. The second phase ensures that the inodes are written to their
- * location on disk rather than just existing in transactions in the log. This
- * means after a quiesce there is no log replay required to write the inodes to
- * disk (this is the main difference between a sync and a quiesce).
- */
-/*
- * First stage of freeze - no writers will make progress now we are here,
- * so we flush delwri and delalloc buffers here, then wait for all I/O to
- * complete.  Data is frozen at that point. Metadata is not frozen,
- * transactions can still occur here so don't bother emptying the AIL
- * because it'll just get dirty again.
- */
-int
-xfs_quiesce_data(
-	struct xfs_mount	*mp)
-{
-	int			error, error2 = 0;
-
-	/* force out the log */
-	xfs_log_force(mp, XFS_LOG_SYNC);
-
-	/* write superblock and hoover up shutdown errors */
-	error = xfs_sync_fsdata(mp);
-
-	/* mark the log as covered if needed */
-	if (xfs_log_need_covered(mp))
-		error2 = xfs_fs_log_dummy(mp);
-
-	return error ? error : error2;
-}
-
 /*
  * Second stage of a quiesce. The data is already synced, now we have to take
  * care of the metadata. New transactions are already blocked, so we need to
  * wait for any remaining transactions to drain out before proceeding.
  *
+ * The second phase ensures that the inodes are written to their
+ * location on disk rather than just existing in transactions in the log. This
+ * means after a quiesce there is no log replay required to write the inodes to
+ * disk (this is the main difference between a sync and a quiesce).
+ *
  * Note: this stops background sync work - the callers must ensure it is started
  * again when appropriate.
  */
@@ -291,6 +237,9 @@ xfs_quiesce_attr(
 	while (atomic_read(&mp->m_active_trans) > 0)
 		delay(100);
 
+	/* force the log to unpin objects from the now complete transactions */
+	xfs_log_force(mp, XFS_LOG_SYNC);
+
 	/* reclaim inodes to do any IO before the freeze completes */
 	xfs_reclaim_inodes(mp, 0);
 	xfs_reclaim_inodes(mp, SYNC_WAIT);
-- 
1.7.10

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 41+ messages in thread

* [PATCH 09/13] xfs: move xfs_quiesce_attr() into xfs_super.c
  2012-09-28  4:44 [PATCH V3 00/13] xfs: remove the xfssyncd mess Dave Chinner
                   ` (7 preceding siblings ...)
  2012-09-28  4:44 ` [PATCH 08/13] xfs: xfs_sync_fsdata is redundant Dave Chinner
@ 2012-09-28  4:44 ` Dave Chinner
  2012-09-28  4:44 ` [PATCH 10/13] xfs: xfs_quiesce_attr() should quiesce the log like unmount Dave Chinner
                   ` (3 subsequent siblings)
  12 siblings, 0 replies; 41+ messages in thread
From: Dave Chinner @ 2012-09-28  4:44 UTC (permalink / raw)
  To: xfs

From: Dave Chinner <dchinner@redhat.com>

Both callers of xfs_quiesce_attr() are in xfs_super.c, and there's
nothing really sync-specific about this functionality so it doesn't
really matter where it lives. Move it to benext to it's callers, so
all the remount/sync_fs code is in the one place.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
---
 fs/xfs/xfs_super.c |   67 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_sync.c  |   65 --------------------------------------------------
 fs/xfs/xfs_sync.h  |    3 ---
 3 files changed, 67 insertions(+), 68 deletions(-)

diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 1ee7b09..4806e46 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1043,6 +1043,73 @@ xfs_restore_resvblks(struct xfs_mount *mp)
 	xfs_reserve_blocks(mp, &resblks, NULL);
 }
 
+/*
+ * Trigger writeback of all the dirty metadata in the file system.
+ *
+ * This ensures that the metadata is written to their location on disk rather
+ * than just existing in transactions in the log. This means after a quiesce
+ * there is no log replay required to write the inodes to disk (this is the main
+ * difference between a sync and a quiesce).
+ *
+ * This shoul deffectively mimic the code in xfs_unmountfs() and
+ * xfs_log_umount() but without tearing down any structures.
+ * XXX: bug fixes needed!
+ *
+ * Note: this stops background log work - the callers must ensure it is started
+ * again when appropriate.
+ */
+void
+xfs_quiesce_attr(
+	struct xfs_mount	*mp)
+{
+	int	error = 0;
+
+	/* wait for all modifications to complete */
+	while (atomic_read(&mp->m_active_trans) > 0)
+		delay(100);
+
+	/* force the log to unpin objects from the now complete transactions */
+	xfs_log_force(mp, XFS_LOG_SYNC);
+
+	/* reclaim inodes to do any IO before the freeze completes */
+	xfs_reclaim_inodes(mp, 0);
+	xfs_reclaim_inodes(mp, SYNC_WAIT);
+
+	/* flush all pending changes from the AIL */
+	xfs_ail_push_all_sync(mp->m_ail);
+
+	/* stop background log work */
+	cancel_delayed_work_sync(&mp->m_log->l_work);
+
+	/*
+	 * Just warn here till VFS can correctly support
+	 * read-only remount without racing.
+	 */
+	WARN_ON(atomic_read(&mp->m_active_trans) != 0);
+
+	/* Push the superblock and write an unmount record */
+	error = xfs_log_sbcount(mp);
+	if (error)
+		xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. "
+				"Frozen image may not be consistent.");
+	xfs_log_unmount_write(mp);
+
+	/*
+	 * At this point we might have modified the superblock again and thus
+	 * added an item to the AIL, thus flush it again.
+	 */
+	xfs_ail_push_all_sync(mp->m_ail);
+
+	/*
+	 * The superblock buffer is uncached and xfsaild_push() will lock and
+	 * set the XBF_ASYNC flag on the buffer. We cannot do xfs_buf_iowait()
+	 * here but a lock on the superblock buffer will block until iodone()
+	 * has completed.
+	 */
+	xfs_buf_lock(mp->m_sb_bp);
+	xfs_buf_unlock(mp->m_sb_bp);
+}
+
 STATIC int
 xfs_fs_remount(
 	struct super_block	*sb,
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index 581eb59..7b63028 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -215,71 +215,6 @@ xfs_inode_ag_iterator(
 }
 
 /*
- * Second stage of a quiesce. The data is already synced, now we have to take
- * care of the metadata. New transactions are already blocked, so we need to
- * wait for any remaining transactions to drain out before proceeding.
- *
- * The second phase ensures that the inodes are written to their
- * location on disk rather than just existing in transactions in the log. This
- * means after a quiesce there is no log replay required to write the inodes to
- * disk (this is the main difference between a sync and a quiesce).
- *
- * Note: this stops background sync work - the callers must ensure it is started
- * again when appropriate.
- */
-void
-xfs_quiesce_attr(
-	struct xfs_mount	*mp)
-{
-	int	error = 0;
-
-	/* wait for all modifications to complete */
-	while (atomic_read(&mp->m_active_trans) > 0)
-		delay(100);
-
-	/* force the log to unpin objects from the now complete transactions */
-	xfs_log_force(mp, XFS_LOG_SYNC);
-
-	/* reclaim inodes to do any IO before the freeze completes */
-	xfs_reclaim_inodes(mp, 0);
-	xfs_reclaim_inodes(mp, SYNC_WAIT);
-
-	/* flush all pending changes from the AIL */
-	xfs_ail_push_all_sync(mp->m_ail);
-
-	/* stop background log work */
-	cancel_delayed_work_sync(&mp->m_log->l_work);
-
-	/*
-	 * Just warn here till VFS can correctly support
-	 * read-only remount without racing.
-	 */
-	WARN_ON(atomic_read(&mp->m_active_trans) != 0);
-
-	/* Push the superblock and write an unmount record */
-	error = xfs_log_sbcount(mp);
-	if (error)
-		xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. "
-				"Frozen image may not be consistent.");
-	xfs_log_unmount_write(mp);
-
-	/*
-	 * At this point we might have modified the superblock again and thus
-	 * added an item to the AIL, thus flush it again.
-	 */
-	xfs_ail_push_all_sync(mp->m_ail);
-
-	/*
-	 * The superblock buffer is uncached and xfsaild_push() will lock and
-	 * set the XBF_ASYNC flag on the buffer. We cannot do xfs_buf_iowait()
-	 * here but a lock on the superblock buffer will block until iodone()
-	 * has completed.
-	 */
-	xfs_buf_lock(mp->m_sb_bp);
-	xfs_buf_unlock(mp->m_sb_bp);
-}
-
-/*
  * Queue a new inode reclaim pass if there are reclaimable inodes and there
  * isn't a reclaim pass already in progress. By default it runs every 5s based
  * on the xfs periodic sync default of 30s. Perhaps this should have it's own
diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h
index 0beabea..0ba9c89 100644
--- a/fs/xfs/xfs_sync.h
+++ b/fs/xfs/xfs_sync.h
@@ -26,9 +26,6 @@ struct xfs_perag;
 
 void xfs_reclaim_worker(struct work_struct *work);
 
-int xfs_quiesce_data(struct xfs_mount *mp);
-void xfs_quiesce_attr(struct xfs_mount *mp);
-
 int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
 int xfs_reclaim_inodes_count(struct xfs_mount *mp);
 void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
-- 
1.7.10

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 41+ messages in thread

* [PATCH 10/13] xfs: xfs_quiesce_attr() should quiesce the log like unmount
  2012-09-28  4:44 [PATCH V3 00/13] xfs: remove the xfssyncd mess Dave Chinner
                   ` (8 preceding siblings ...)
  2012-09-28  4:44 ` [PATCH 09/13] xfs: move xfs_quiesce_attr() into xfs_super.c Dave Chinner
@ 2012-09-28  4:44 ` Dave Chinner
  2012-09-28  4:44 ` [PATCH 11/13] xfs: rename xfs_sync.[ch] to xfs_icache.[ch] Dave Chinner
                   ` (2 subsequent siblings)
  12 siblings, 0 replies; 41+ messages in thread
From: Dave Chinner @ 2012-09-28  4:44 UTC (permalink / raw)
  To: xfs

From: Dave Chinner <dchinner@redhat.com>

xfs_quiesce_attr() is supposed to leave the log empty with an
unmount record written. Right now it does not wait for the AIL to be
emptied before writing the unmount record, not does it wait for
metadata IO completion, either. Fix it to use the same method and
code as xfs_log_unmount().

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
---
 fs/xfs/xfs_log.c   |   25 ++++++++++++++++++-------
 fs/xfs/xfs_log.h   |    1 +
 fs/xfs/xfs_super.c |   41 ++++++++---------------------------------
 3 files changed, 27 insertions(+), 40 deletions(-)

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 96ea9ed..67d63f6 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -853,20 +853,17 @@ xfs_log_unmount_write(xfs_mount_t *mp)
 }	/* xfs_log_unmount_write */
 
 /*
- * Shut down and release the AIL and Log.
- *
- * During unmount, we need to ensure we flush all the dirty metadata objects
- * from the AIL so that the log is empty before we write the unmount record to
- * the log.
+ * Empty the log for unmount/freeze.
  *
  * To do this, we first need to shut down the background log work so it is not
  * trying to cover the log as we clean up. We then need to unpin all objects in
  * the log so we can then flush them out. Once they have completed their IO and
  * run the callbacks removing themselves from the AIL, we can write the unmount
- * record, tear down the AIL and finally free the log.
+ * record.
  */
 void
-xfs_log_unmount(xfs_mount_t *mp)
+xfs_log_quiesce(
+	struct xfs_mount	*mp)
 {
 	cancel_delayed_work_sync(&mp->m_log->l_work);
 	xfs_log_force(mp, XFS_LOG_SYNC);
@@ -884,6 +881,20 @@ xfs_log_unmount(xfs_mount_t *mp)
 	xfs_buf_unlock(mp->m_sb_bp);
 
 	xfs_log_unmount_write(mp);
+}
+
+/*
+ * Shut down and release the AIL and Log.
+ *
+ * During unmount, we need to ensure we flush all the dirty metadata objects
+ * from the AIL so that the log is empty before we write the unmount record to
+ * the log. Once this is done, we can tear down the AIL and the log.
+ */
+void
+xfs_log_unmount(
+	struct xfs_mount	*mp)
+{
+	xfs_log_quiesce(mp);
 
 	xfs_trans_ail_destroy(mp);
 	xlog_dealloc_log(mp->m_log);
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 26ed7de..5caee96 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -183,6 +183,7 @@ bool	xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
 
 void	xfs_log_work_queue(struct xfs_mount *mp);
 void	xfs_log_worker(struct work_struct *work);
+void	xfs_log_quiesce(struct xfs_mount *mp);
 
 #endif
 #endif	/* __XFS_LOG_H__ */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 4806e46..81de924 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1048,15 +1048,11 @@ xfs_restore_resvblks(struct xfs_mount *mp)
  *
  * This ensures that the metadata is written to their location on disk rather
  * than just existing in transactions in the log. This means after a quiesce
- * there is no log replay required to write the inodes to disk (this is the main
- * difference between a sync and a quiesce).
+ * there is no log replay required to write the inodes to disk - this is the
+ * primary difference between a sync and a quiesce.
  *
- * This shoul deffectively mimic the code in xfs_unmountfs() and
- * xfs_log_umount() but without tearing down any structures.
- * XXX: bug fixes needed!
- *
- * Note: this stops background log work - the callers must ensure it is started
- * again when appropriate.
+ * Note: xfs_log_quiesce() stops background log work - the callers must ensure
+ * it is started again when appropriate.
  */
 void
 xfs_quiesce_attr(
@@ -1075,39 +1071,18 @@ xfs_quiesce_attr(
 	xfs_reclaim_inodes(mp, 0);
 	xfs_reclaim_inodes(mp, SYNC_WAIT);
 
-	/* flush all pending changes from the AIL */
-	xfs_ail_push_all_sync(mp->m_ail);
-
-	/* stop background log work */
-	cancel_delayed_work_sync(&mp->m_log->l_work);
-
-	/*
-	 * Just warn here till VFS can correctly support
-	 * read-only remount without racing.
-	 */
-	WARN_ON(atomic_read(&mp->m_active_trans) != 0);
-
 	/* Push the superblock and write an unmount record */
 	error = xfs_log_sbcount(mp);
 	if (error)
 		xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. "
 				"Frozen image may not be consistent.");
-	xfs_log_unmount_write(mp);
-
 	/*
-	 * At this point we might have modified the superblock again and thus
-	 * added an item to the AIL, thus flush it again.
+	 * Just warn here till VFS can correctly support
+	 * read-only remount without racing.
 	 */
-	xfs_ail_push_all_sync(mp->m_ail);
+	WARN_ON(atomic_read(&mp->m_active_trans) != 0);
 
-	/*
-	 * The superblock buffer is uncached and xfsaild_push() will lock and
-	 * set the XBF_ASYNC flag on the buffer. We cannot do xfs_buf_iowait()
-	 * here but a lock on the superblock buffer will block until iodone()
-	 * has completed.
-	 */
-	xfs_buf_lock(mp->m_sb_bp);
-	xfs_buf_unlock(mp->m_sb_bp);
+	xfs_log_quiesce(mp);
 }
 
 STATIC int
-- 
1.7.10

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 41+ messages in thread

* [PATCH 11/13] xfs: rename xfs_sync.[ch] to xfs_icache.[ch]
  2012-09-28  4:44 [PATCH V3 00/13] xfs: remove the xfssyncd mess Dave Chinner
                   ` (9 preceding siblings ...)
  2012-09-28  4:44 ` [PATCH 10/13] xfs: xfs_quiesce_attr() should quiesce the log like unmount Dave Chinner
@ 2012-09-28  4:44 ` Dave Chinner
  2012-09-28  4:44 ` [PATCH 12/13] xfs: move inode locking functions to xfs_inode.c Dave Chinner
  2012-09-28  4:44 ` [PATCH 13/13] xfs: remove xfs_iget.c Dave Chinner
  12 siblings, 0 replies; 41+ messages in thread
From: Dave Chinner @ 2012-09-28  4:44 UTC (permalink / raw)
  To: xfs

From: Dave Chinner <dchinner@redhat.com>

xfs_sync.c now only contains inode reclaim functions and inode cache
iteration functions. It is not related to sync operations anymore.
Rename to xfs_icache.c to reflect it's contents and prepare for
consolidation with the other inode cache file that exists
(xfs_iget.c).

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
---
 fs/xfs/Makefile          |    2 +-
 fs/xfs/xfs_icache.c      |  715 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_icache.h      |   43 +++
 fs/xfs/xfs_iget.c        |    1 +
 fs/xfs/xfs_mount.c       |    1 +
 fs/xfs/xfs_mount.h       |    2 -
 fs/xfs/xfs_qm_syscalls.c |    1 +
 fs/xfs/xfs_super.c       |    2 +-
 fs/xfs/xfs_sync.c        |  714 ---------------------------------------------
 fs/xfs/xfs_sync.h        |   43 ---
 10 files changed, 763 insertions(+), 761 deletions(-)
 create mode 100644 fs/xfs/xfs_icache.c
 create mode 100644 fs/xfs/xfs_icache.h
 delete mode 100644 fs/xfs/xfs_sync.c
 delete mode 100644 fs/xfs/xfs_sync.h

diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index d2bf974..442f256 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -39,6 +39,7 @@ xfs-y				+= xfs_aops.o \
 				   xfs_fsops.o \
 				   xfs_fs_subr.o \
 				   xfs_globals.o \
+				   xfs_icache.o \
 				   xfs_iget.o \
 				   xfs_ioctl.o \
 				   xfs_iomap.o \
@@ -47,7 +48,6 @@ xfs-y				+= xfs_aops.o \
 				   xfs_message.o \
 				   xfs_mru_cache.o \
 				   xfs_super.o \
-				   xfs_sync.o \
 				   xfs_xattr.o \
 				   xfs_rename.o \
 				   xfs_utils.o \
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
new file mode 100644
index 0000000..eba216f
--- /dev/null
+++ b/fs/xfs/xfs_icache.c
@@ -0,0 +1,715 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_log.h"
+#include "xfs_log_priv.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_dinode.h"
+#include "xfs_error.h"
+#include "xfs_filestream.h"
+#include "xfs_vnodeops.h"
+#include "xfs_inode_item.h"
+#include "xfs_quota.h"
+#include "xfs_trace.h"
+#include "xfs_fsops.h"
+#include "xfs_icache.h"
+
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+
+/*
+ * The inode lookup is done in batches to keep the amount of lock traffic and
+ * radix tree lookups to a minimum. The batch size is a trade off between
+ * lookup reduction and stack usage. This is in the reclaim path, so we can't
+ * be too greedy.
+ */
+#define XFS_LOOKUP_BATCH	32
+
+STATIC int
+xfs_inode_ag_walk_grab(
+	struct xfs_inode	*ip)
+{
+	struct inode		*inode = VFS_I(ip);
+
+	ASSERT(rcu_read_lock_held());
+
+	/*
+	 * check for stale RCU freed inode
+	 *
+	 * If the inode has been reallocated, it doesn't matter if it's not in
+	 * the AG we are walking - we are walking for writeback, so if it
+	 * passes all the "valid inode" checks and is dirty, then we'll write
+	 * it back anyway.  If it has been reallocated and still being
+	 * initialised, the XFS_INEW check below will catch it.
+	 */
+	spin_lock(&ip->i_flags_lock);
+	if (!ip->i_ino)
+		goto out_unlock_noent;
+
+	/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
+	if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
+		goto out_unlock_noent;
+	spin_unlock(&ip->i_flags_lock);
+
+	/* nothing to sync during shutdown */
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+		return EFSCORRUPTED;
+
+	/* If we can't grab the inode, it must on it's way to reclaim. */
+	if (!igrab(inode))
+		return ENOENT;
+
+	if (is_bad_inode(inode)) {
+		IRELE(ip);
+		return ENOENT;
+	}
+
+	/* inode is valid */
+	return 0;
+
+out_unlock_noent:
+	spin_unlock(&ip->i_flags_lock);
+	return ENOENT;
+}
+
+STATIC int
+xfs_inode_ag_walk(
+	struct xfs_mount	*mp,
+	struct xfs_perag	*pag,
+	int			(*execute)(struct xfs_inode *ip,
+					   struct xfs_perag *pag, int flags),
+	int			flags)
+{
+	uint32_t		first_index;
+	int			last_error = 0;
+	int			skipped;
+	int			done;
+	int			nr_found;
+
+restart:
+	done = 0;
+	skipped = 0;
+	first_index = 0;
+	nr_found = 0;
+	do {
+		struct xfs_inode *batch[XFS_LOOKUP_BATCH];
+		int		error = 0;
+		int		i;
+
+		rcu_read_lock();
+		nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
+					(void **)batch, first_index,
+					XFS_LOOKUP_BATCH);
+		if (!nr_found) {
+			rcu_read_unlock();
+			break;
+		}
+
+		/*
+		 * Grab the inodes before we drop the lock. if we found
+		 * nothing, nr == 0 and the loop will be skipped.
+		 */
+		for (i = 0; i < nr_found; i++) {
+			struct xfs_inode *ip = batch[i];
+
+			if (done || xfs_inode_ag_walk_grab(ip))
+				batch[i] = NULL;
+
+			/*
+			 * Update the index for the next lookup. Catch
+			 * overflows into the next AG range which can occur if
+			 * we have inodes in the last block of the AG and we
+			 * are currently pointing to the last inode.
+			 *
+			 * Because we may see inodes that are from the wrong AG
+			 * due to RCU freeing and reallocation, only update the
+			 * index if it lies in this AG. It was a race that lead
+			 * us to see this inode, so another lookup from the
+			 * same index will not find it again.
+			 */
+			if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
+				continue;
+			first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
+			if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
+				done = 1;
+		}
+
+		/* unlock now we've grabbed the inodes. */
+		rcu_read_unlock();
+
+		for (i = 0; i < nr_found; i++) {
+			if (!batch[i])
+				continue;
+			error = execute(batch[i], pag, flags);
+			IRELE(batch[i]);
+			if (error == EAGAIN) {
+				skipped++;
+				continue;
+			}
+			if (error && last_error != EFSCORRUPTED)
+				last_error = error;
+		}
+
+		/* bail out if the filesystem is corrupted.  */
+		if (error == EFSCORRUPTED)
+			break;
+
+		cond_resched();
+
+	} while (nr_found && !done);
+
+	if (skipped) {
+		delay(1);
+		goto restart;
+	}
+	return last_error;
+}
+
+int
+xfs_inode_ag_iterator(
+	struct xfs_mount	*mp,
+	int			(*execute)(struct xfs_inode *ip,
+					   struct xfs_perag *pag, int flags),
+	int			flags)
+{
+	struct xfs_perag	*pag;
+	int			error = 0;
+	int			last_error = 0;
+	xfs_agnumber_t		ag;
+
+	ag = 0;
+	while ((pag = xfs_perag_get(mp, ag))) {
+		ag = pag->pag_agno + 1;
+		error = xfs_inode_ag_walk(mp, pag, execute, flags);
+		xfs_perag_put(pag);
+		if (error) {
+			last_error = error;
+			if (error == EFSCORRUPTED)
+				break;
+		}
+	}
+	return XFS_ERROR(last_error);
+}
+
+/*
+ * Queue a new inode reclaim pass if there are reclaimable inodes and there
+ * isn't a reclaim pass already in progress. By default it runs every 5s based
+ * on the xfs periodic sync default of 30s. Perhaps this should have it's own
+ * tunable, but that can be done if this method proves to be ineffective or too
+ * aggressive.
+ */
+static void
+xfs_reclaim_work_queue(
+	struct xfs_mount        *mp)
+{
+
+	rcu_read_lock();
+	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
+		queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
+			msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
+	}
+	rcu_read_unlock();
+}
+
+/*
+ * This is a fast pass over the inode cache to try to get reclaim moving on as
+ * many inodes as possible in a short period of time. It kicks itself every few
+ * seconds, as well as being kicked by the inode cache shrinker when memory
+ * goes low. It scans as quickly as possible avoiding locked inodes or those
+ * already being flushed, and once done schedules a future pass.
+ */
+void
+xfs_reclaim_worker(
+	struct work_struct *work)
+{
+	struct xfs_mount *mp = container_of(to_delayed_work(work),
+					struct xfs_mount, m_reclaim_work);
+
+	xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
+	xfs_reclaim_work_queue(mp);
+}
+
+void
+__xfs_inode_set_reclaim_tag(
+	struct xfs_perag	*pag,
+	struct xfs_inode	*ip)
+{
+	radix_tree_tag_set(&pag->pag_ici_root,
+			   XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
+			   XFS_ICI_RECLAIM_TAG);
+
+	if (!pag->pag_ici_reclaimable) {
+		/* propagate the reclaim tag up into the perag radix tree */
+		spin_lock(&ip->i_mount->m_perag_lock);
+		radix_tree_tag_set(&ip->i_mount->m_perag_tree,
+				XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+				XFS_ICI_RECLAIM_TAG);
+		spin_unlock(&ip->i_mount->m_perag_lock);
+
+		/* schedule periodic background inode reclaim */
+		xfs_reclaim_work_queue(ip->i_mount);
+
+		trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
+							-1, _RET_IP_);
+	}
+	pag->pag_ici_reclaimable++;
+}
+
+/*
+ * We set the inode flag atomically with the radix tree tag.
+ * Once we get tag lookups on the radix tree, this inode flag
+ * can go away.
+ */
+void
+xfs_inode_set_reclaim_tag(
+	xfs_inode_t	*ip)
+{
+	struct xfs_mount *mp = ip->i_mount;
+	struct xfs_perag *pag;
+
+	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+	spin_lock(&pag->pag_ici_lock);
+	spin_lock(&ip->i_flags_lock);
+	__xfs_inode_set_reclaim_tag(pag, ip);
+	__xfs_iflags_set(ip, XFS_IRECLAIMABLE);
+	spin_unlock(&ip->i_flags_lock);
+	spin_unlock(&pag->pag_ici_lock);
+	xfs_perag_put(pag);
+}
+
+STATIC void
+__xfs_inode_clear_reclaim(
+	xfs_perag_t	*pag,
+	xfs_inode_t	*ip)
+{
+	pag->pag_ici_reclaimable--;
+	if (!pag->pag_ici_reclaimable) {
+		/* clear the reclaim tag from the perag radix tree */
+		spin_lock(&ip->i_mount->m_perag_lock);
+		radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
+				XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+				XFS_ICI_RECLAIM_TAG);
+		spin_unlock(&ip->i_mount->m_perag_lock);
+		trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno,
+							-1, _RET_IP_);
+	}
+}
+
+void
+__xfs_inode_clear_reclaim_tag(
+	xfs_mount_t	*mp,
+	xfs_perag_t	*pag,
+	xfs_inode_t	*ip)
+{
+	radix_tree_tag_clear(&pag->pag_ici_root,
+			XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
+	__xfs_inode_clear_reclaim(pag, ip);
+}
+
+/*
+ * Grab the inode for reclaim exclusively.
+ * Return 0 if we grabbed it, non-zero otherwise.
+ */
+STATIC int
+xfs_reclaim_inode_grab(
+	struct xfs_inode	*ip,
+	int			flags)
+{
+	ASSERT(rcu_read_lock_held());
+
+	/* quick check for stale RCU freed inode */
+	if (!ip->i_ino)
+		return 1;
+
+	/*
+	 * If we are asked for non-blocking operation, do unlocked checks to
+	 * see if the inode already is being flushed or in reclaim to avoid
+	 * lock traffic.
+	 */
+	if ((flags & SYNC_TRYLOCK) &&
+	    __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM))
+		return 1;
+
+	/*
+	 * The radix tree lock here protects a thread in xfs_iget from racing
+	 * with us starting reclaim on the inode.  Once we have the
+	 * XFS_IRECLAIM flag set it will not touch us.
+	 *
+	 * Due to RCU lookup, we may find inodes that have been freed and only
+	 * have XFS_IRECLAIM set.  Indeed, we may see reallocated inodes that
+	 * aren't candidates for reclaim at all, so we must check the
+	 * XFS_IRECLAIMABLE is set first before proceeding to reclaim.
+	 */
+	spin_lock(&ip->i_flags_lock);
+	if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
+	    __xfs_iflags_test(ip, XFS_IRECLAIM)) {
+		/* not a reclaim candidate. */
+		spin_unlock(&ip->i_flags_lock);
+		return 1;
+	}
+	__xfs_iflags_set(ip, XFS_IRECLAIM);
+	spin_unlock(&ip->i_flags_lock);
+	return 0;
+}
+
+/*
+ * Inodes in different states need to be treated differently. The following
+ * table lists the inode states and the reclaim actions necessary:
+ *
+ *	inode state	     iflush ret		required action
+ *      ---------------      ----------         ---------------
+ *	bad			-		reclaim
+ *	shutdown		EIO		unpin and reclaim
+ *	clean, unpinned		0		reclaim
+ *	stale, unpinned		0		reclaim
+ *	clean, pinned(*)	0		requeue
+ *	stale, pinned		EAGAIN		requeue
+ *	dirty, async		-		requeue
+ *	dirty, sync		0		reclaim
+ *
+ * (*) dgc: I don't think the clean, pinned state is possible but it gets
+ * handled anyway given the order of checks implemented.
+ *
+ * Also, because we get the flush lock first, we know that any inode that has
+ * been flushed delwri has had the flush completed by the time we check that
+ * the inode is clean.
+ *
+ * Note that because the inode is flushed delayed write by AIL pushing, the
+ * flush lock may already be held here and waiting on it can result in very
+ * long latencies.  Hence for sync reclaims, where we wait on the flush lock,
+ * the caller should push the AIL first before trying to reclaim inodes to
+ * minimise the amount of time spent waiting.  For background relaim, we only
+ * bother to reclaim clean inodes anyway.
+ *
+ * Hence the order of actions after gaining the locks should be:
+ *	bad		=> reclaim
+ *	shutdown	=> unpin and reclaim
+ *	pinned, async	=> requeue
+ *	pinned, sync	=> unpin
+ *	stale		=> reclaim
+ *	clean		=> reclaim
+ *	dirty, async	=> requeue
+ *	dirty, sync	=> flush, wait and reclaim
+ */
+STATIC int
+xfs_reclaim_inode(
+	struct xfs_inode	*ip,
+	struct xfs_perag	*pag,
+	int			sync_mode)
+{
+	struct xfs_buf		*bp = NULL;
+	int			error;
+
+restart:
+	error = 0;
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	if (!xfs_iflock_nowait(ip)) {
+		if (!(sync_mode & SYNC_WAIT))
+			goto out;
+		xfs_iflock(ip);
+	}
+
+	if (is_bad_inode(VFS_I(ip)))
+		goto reclaim;
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+		xfs_iunpin_wait(ip);
+		xfs_iflush_abort(ip, false);
+		goto reclaim;
+	}
+	if (xfs_ipincount(ip)) {
+		if (!(sync_mode & SYNC_WAIT))
+			goto out_ifunlock;
+		xfs_iunpin_wait(ip);
+	}
+	if (xfs_iflags_test(ip, XFS_ISTALE))
+		goto reclaim;
+	if (xfs_inode_clean(ip))
+		goto reclaim;
+
+	/*
+	 * Never flush out dirty data during non-blocking reclaim, as it would
+	 * just contend with AIL pushing trying to do the same job.
+	 */
+	if (!(sync_mode & SYNC_WAIT))
+		goto out_ifunlock;
+
+	/*
+	 * Now we have an inode that needs flushing.
+	 *
+	 * Note that xfs_iflush will never block on the inode buffer lock, as
+	 * xfs_ifree_cluster() can lock the inode buffer before it locks the
+	 * ip->i_lock, and we are doing the exact opposite here.  As a result,
+	 * doing a blocking xfs_imap_to_bp() to get the cluster buffer would
+	 * result in an ABBA deadlock with xfs_ifree_cluster().
+	 *
+	 * As xfs_ifree_cluser() must gather all inodes that are active in the
+	 * cache to mark them stale, if we hit this case we don't actually want
+	 * to do IO here - we want the inode marked stale so we can simply
+	 * reclaim it.  Hence if we get an EAGAIN error here,  just unlock the
+	 * inode, back off and try again.  Hopefully the next pass through will
+	 * see the stale flag set on the inode.
+	 */
+	error = xfs_iflush(ip, &bp);
+	if (error == EAGAIN) {
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		/* backoff longer than in xfs_ifree_cluster */
+		delay(2);
+		goto restart;
+	}
+
+	if (!error) {
+		error = xfs_bwrite(bp);
+		xfs_buf_relse(bp);
+	}
+
+	xfs_iflock(ip);
+reclaim:
+	xfs_ifunlock(ip);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+	XFS_STATS_INC(xs_ig_reclaims);
+	/*
+	 * Remove the inode from the per-AG radix tree.
+	 *
+	 * Because radix_tree_delete won't complain even if the item was never
+	 * added to the tree assert that it's been there before to catch
+	 * problems with the inode life time early on.
+	 */
+	spin_lock(&pag->pag_ici_lock);
+	if (!radix_tree_delete(&pag->pag_ici_root,
+				XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
+		ASSERT(0);
+	__xfs_inode_clear_reclaim(pag, ip);
+	spin_unlock(&pag->pag_ici_lock);
+
+	/*
+	 * Here we do an (almost) spurious inode lock in order to coordinate
+	 * with inode cache radix tree lookups.  This is because the lookup
+	 * can reference the inodes in the cache without taking references.
+	 *
+	 * We make that OK here by ensuring that we wait until the inode is
+	 * unlocked after the lookup before we go ahead and free it.
+	 */
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_qm_dqdetach(ip);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+	xfs_inode_free(ip);
+	return error;
+
+out_ifunlock:
+	xfs_ifunlock(ip);
+out:
+	xfs_iflags_clear(ip, XFS_IRECLAIM);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	/*
+	 * We could return EAGAIN here to make reclaim rescan the inode tree in
+	 * a short while. However, this just burns CPU time scanning the tree
+	 * waiting for IO to complete and the reclaim work never goes back to
+	 * the idle state. Instead, return 0 to let the next scheduled
+	 * background reclaim attempt to reclaim the inode again.
+	 */
+	return 0;
+}
+
+/*
+ * Walk the AGs and reclaim the inodes in them. Even if the filesystem is
+ * corrupted, we still want to try to reclaim all the inodes. If we don't,
+ * then a shut down during filesystem unmount reclaim walk leak all the
+ * unreclaimed inodes.
+ */
+int
+xfs_reclaim_inodes_ag(
+	struct xfs_mount	*mp,
+	int			flags,
+	int			*nr_to_scan)
+{
+	struct xfs_perag	*pag;
+	int			error = 0;
+	int			last_error = 0;
+	xfs_agnumber_t		ag;
+	int			trylock = flags & SYNC_TRYLOCK;
+	int			skipped;
+
+restart:
+	ag = 0;
+	skipped = 0;
+	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
+		unsigned long	first_index = 0;
+		int		done = 0;
+		int		nr_found = 0;
+
+		ag = pag->pag_agno + 1;
+
+		if (trylock) {
+			if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
+				skipped++;
+				xfs_perag_put(pag);
+				continue;
+			}
+			first_index = pag->pag_ici_reclaim_cursor;
+		} else
+			mutex_lock(&pag->pag_ici_reclaim_lock);
+
+		do {
+			struct xfs_inode *batch[XFS_LOOKUP_BATCH];
+			int	i;
+
+			rcu_read_lock();
+			nr_found = radix_tree_gang_lookup_tag(
+					&pag->pag_ici_root,
+					(void **)batch, first_index,
+					XFS_LOOKUP_BATCH,
+					XFS_ICI_RECLAIM_TAG);
+			if (!nr_found) {
+				done = 1;
+				rcu_read_unlock();
+				break;
+			}
+
+			/*
+			 * Grab the inodes before we drop the lock. if we found
+			 * nothing, nr == 0 and the loop will be skipped.
+			 */
+			for (i = 0; i < nr_found; i++) {
+				struct xfs_inode *ip = batch[i];
+
+				if (done || xfs_reclaim_inode_grab(ip, flags))
+					batch[i] = NULL;
+
+				/*
+				 * Update the index for the next lookup. Catch
+				 * overflows into the next AG range which can
+				 * occur if we have inodes in the last block of
+				 * the AG and we are currently pointing to the
+				 * last inode.
+				 *
+				 * Because we may see inodes that are from the
+				 * wrong AG due to RCU freeing and
+				 * reallocation, only update the index if it
+				 * lies in this AG. It was a race that lead us
+				 * to see this inode, so another lookup from
+				 * the same index will not find it again.
+				 */
+				if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
+								pag->pag_agno)
+					continue;
+				first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
+				if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
+					done = 1;
+			}
+
+			/* unlock now we've grabbed the inodes. */
+			rcu_read_unlock();
+
+			for (i = 0; i < nr_found; i++) {
+				if (!batch[i])
+					continue;
+				error = xfs_reclaim_inode(batch[i], pag, flags);
+				if (error && last_error != EFSCORRUPTED)
+					last_error = error;
+			}
+
+			*nr_to_scan -= XFS_LOOKUP_BATCH;
+
+			cond_resched();
+
+		} while (nr_found && !done && *nr_to_scan > 0);
+
+		if (trylock && !done)
+			pag->pag_ici_reclaim_cursor = first_index;
+		else
+			pag->pag_ici_reclaim_cursor = 0;
+		mutex_unlock(&pag->pag_ici_reclaim_lock);
+		xfs_perag_put(pag);
+	}
+
+	/*
+	 * if we skipped any AG, and we still have scan count remaining, do
+	 * another pass this time using blocking reclaim semantics (i.e
+	 * waiting on the reclaim locks and ignoring the reclaim cursors). This
+	 * ensure that when we get more reclaimers than AGs we block rather
+	 * than spin trying to execute reclaim.
+	 */
+	if (skipped && (flags & SYNC_WAIT) && *nr_to_scan > 0) {
+		trylock = 0;
+		goto restart;
+	}
+	return XFS_ERROR(last_error);
+}
+
+int
+xfs_reclaim_inodes(
+	xfs_mount_t	*mp,
+	int		mode)
+{
+	int		nr_to_scan = INT_MAX;
+
+	return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan);
+}
+
+/*
+ * Scan a certain number of inodes for reclaim.
+ *
+ * When called we make sure that there is a background (fast) inode reclaim in
+ * progress, while we will throttle the speed of reclaim via doing synchronous
+ * reclaim of inodes. That means if we come across dirty inodes, we wait for
+ * them to be cleaned, which we hope will not be very long due to the
+ * background walker having already kicked the IO off on those dirty inodes.
+ */
+void
+xfs_reclaim_inodes_nr(
+	struct xfs_mount	*mp,
+	int			nr_to_scan)
+{
+	/* kick background reclaimer and push the AIL */
+	xfs_reclaim_work_queue(mp);
+	xfs_ail_push_all(mp->m_ail);
+
+	xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan);
+}
+
+/*
+ * Return the number of reclaimable inodes in the filesystem for
+ * the shrinker to determine how much to reclaim.
+ */
+int
+xfs_reclaim_inodes_count(
+	struct xfs_mount	*mp)
+{
+	struct xfs_perag	*pag;
+	xfs_agnumber_t		ag = 0;
+	int			reclaimable = 0;
+
+	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
+		ag = pag->pag_agno + 1;
+		reclaimable += pag->pag_ici_reclaimable;
+		xfs_perag_put(pag);
+	}
+	return reclaimable;
+}
+
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
new file mode 100644
index 0000000..0ba9c89
--- /dev/null
+++ b/fs/xfs/xfs_icache.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef XFS_SYNC_H
+#define XFS_SYNC_H 1
+
+struct xfs_mount;
+struct xfs_perag;
+
+#define SYNC_WAIT		0x0001	/* wait for i/o to complete */
+#define SYNC_TRYLOCK		0x0002  /* only try to lock inodes */
+
+void xfs_reclaim_worker(struct work_struct *work);
+
+int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
+int xfs_reclaim_inodes_count(struct xfs_mount *mp);
+void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
+
+void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
+void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
+void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
+				struct xfs_inode *ip);
+
+int xfs_sync_inode_grab(struct xfs_inode *ip);
+int xfs_inode_ag_iterator(struct xfs_mount *mp,
+	int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
+	int flags);
+
+#endif
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 784a803..069c5ce 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -38,6 +38,7 @@
 #include "xfs_inode_item.h"
 #include "xfs_bmap.h"
 #include "xfs_trace.h"
+#include "xfs_icache.h"
 
 
 /*
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 149b601..7dfbf07 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -42,6 +42,7 @@
 #include "xfs_fsops.h"
 #include "xfs_utils.h"
 #include "xfs_trace.h"
+#include "xfs_icache.h"
 
 
 #ifdef HAVE_PERCPU_SB
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 7c417b6..a631ca3 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -51,8 +51,6 @@ typedef struct xfs_trans_reservations {
 
 #else /* __KERNEL__ */
 
-#include "xfs_sync.h"
-
 struct xlog;
 struct xfs_inode;
 struct xfs_mru_cache;
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 858a3b1..7a9071f 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -40,6 +40,7 @@
 #include "xfs_utils.h"
 #include "xfs_qm.h"
 #include "xfs_trace.h"
+#include "xfs_icache.h"
 
 STATIC int	xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint);
 STATIC int	xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *,
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 81de924..ce52520 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -49,7 +49,7 @@
 #include "xfs_extfree_item.h"
 #include "xfs_mru_cache.h"
 #include "xfs_inode_item.h"
-#include "xfs_sync.h"
+#include "xfs_icache.h"
 #include "xfs_trace.h"
 
 #include <linux/namei.h>
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
deleted file mode 100644
index 7b63028..0000000
--- a/fs/xfs/xfs_sync.c
+++ /dev/null
@@ -1,714 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_log_priv.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_trans_priv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_inode.h"
-#include "xfs_dinode.h"
-#include "xfs_error.h"
-#include "xfs_filestream.h"
-#include "xfs_vnodeops.h"
-#include "xfs_inode_item.h"
-#include "xfs_quota.h"
-#include "xfs_trace.h"
-#include "xfs_fsops.h"
-
-#include <linux/kthread.h>
-#include <linux/freezer.h>
-
-/*
- * The inode lookup is done in batches to keep the amount of lock traffic and
- * radix tree lookups to a minimum. The batch size is a trade off between
- * lookup reduction and stack usage. This is in the reclaim path, so we can't
- * be too greedy.
- */
-#define XFS_LOOKUP_BATCH	32
-
-STATIC int
-xfs_inode_ag_walk_grab(
-	struct xfs_inode	*ip)
-{
-	struct inode		*inode = VFS_I(ip);
-
-	ASSERT(rcu_read_lock_held());
-
-	/*
-	 * check for stale RCU freed inode
-	 *
-	 * If the inode has been reallocated, it doesn't matter if it's not in
-	 * the AG we are walking - we are walking for writeback, so if it
-	 * passes all the "valid inode" checks and is dirty, then we'll write
-	 * it back anyway.  If it has been reallocated and still being
-	 * initialised, the XFS_INEW check below will catch it.
-	 */
-	spin_lock(&ip->i_flags_lock);
-	if (!ip->i_ino)
-		goto out_unlock_noent;
-
-	/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
-	if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
-		goto out_unlock_noent;
-	spin_unlock(&ip->i_flags_lock);
-
-	/* nothing to sync during shutdown */
-	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
-		return EFSCORRUPTED;
-
-	/* If we can't grab the inode, it must on it's way to reclaim. */
-	if (!igrab(inode))
-		return ENOENT;
-
-	if (is_bad_inode(inode)) {
-		IRELE(ip);
-		return ENOENT;
-	}
-
-	/* inode is valid */
-	return 0;
-
-out_unlock_noent:
-	spin_unlock(&ip->i_flags_lock);
-	return ENOENT;
-}
-
-STATIC int
-xfs_inode_ag_walk(
-	struct xfs_mount	*mp,
-	struct xfs_perag	*pag,
-	int			(*execute)(struct xfs_inode *ip,
-					   struct xfs_perag *pag, int flags),
-	int			flags)
-{
-	uint32_t		first_index;
-	int			last_error = 0;
-	int			skipped;
-	int			done;
-	int			nr_found;
-
-restart:
-	done = 0;
-	skipped = 0;
-	first_index = 0;
-	nr_found = 0;
-	do {
-		struct xfs_inode *batch[XFS_LOOKUP_BATCH];
-		int		error = 0;
-		int		i;
-
-		rcu_read_lock();
-		nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
-					(void **)batch, first_index,
-					XFS_LOOKUP_BATCH);
-		if (!nr_found) {
-			rcu_read_unlock();
-			break;
-		}
-
-		/*
-		 * Grab the inodes before we drop the lock. if we found
-		 * nothing, nr == 0 and the loop will be skipped.
-		 */
-		for (i = 0; i < nr_found; i++) {
-			struct xfs_inode *ip = batch[i];
-
-			if (done || xfs_inode_ag_walk_grab(ip))
-				batch[i] = NULL;
-
-			/*
-			 * Update the index for the next lookup. Catch
-			 * overflows into the next AG range which can occur if
-			 * we have inodes in the last block of the AG and we
-			 * are currently pointing to the last inode.
-			 *
-			 * Because we may see inodes that are from the wrong AG
-			 * due to RCU freeing and reallocation, only update the
-			 * index if it lies in this AG. It was a race that lead
-			 * us to see this inode, so another lookup from the
-			 * same index will not find it again.
-			 */
-			if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
-				continue;
-			first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
-			if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
-				done = 1;
-		}
-
-		/* unlock now we've grabbed the inodes. */
-		rcu_read_unlock();
-
-		for (i = 0; i < nr_found; i++) {
-			if (!batch[i])
-				continue;
-			error = execute(batch[i], pag, flags);
-			IRELE(batch[i]);
-			if (error == EAGAIN) {
-				skipped++;
-				continue;
-			}
-			if (error && last_error != EFSCORRUPTED)
-				last_error = error;
-		}
-
-		/* bail out if the filesystem is corrupted.  */
-		if (error == EFSCORRUPTED)
-			break;
-
-		cond_resched();
-
-	} while (nr_found && !done);
-
-	if (skipped) {
-		delay(1);
-		goto restart;
-	}
-	return last_error;
-}
-
-int
-xfs_inode_ag_iterator(
-	struct xfs_mount	*mp,
-	int			(*execute)(struct xfs_inode *ip,
-					   struct xfs_perag *pag, int flags),
-	int			flags)
-{
-	struct xfs_perag	*pag;
-	int			error = 0;
-	int			last_error = 0;
-	xfs_agnumber_t		ag;
-
-	ag = 0;
-	while ((pag = xfs_perag_get(mp, ag))) {
-		ag = pag->pag_agno + 1;
-		error = xfs_inode_ag_walk(mp, pag, execute, flags);
-		xfs_perag_put(pag);
-		if (error) {
-			last_error = error;
-			if (error == EFSCORRUPTED)
-				break;
-		}
-	}
-	return XFS_ERROR(last_error);
-}
-
-/*
- * Queue a new inode reclaim pass if there are reclaimable inodes and there
- * isn't a reclaim pass already in progress. By default it runs every 5s based
- * on the xfs periodic sync default of 30s. Perhaps this should have it's own
- * tunable, but that can be done if this method proves to be ineffective or too
- * aggressive.
- */
-static void
-xfs_reclaim_work_queue(
-	struct xfs_mount        *mp)
-{
-
-	rcu_read_lock();
-	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
-		queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
-			msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
-	}
-	rcu_read_unlock();
-}
-
-/*
- * This is a fast pass over the inode cache to try to get reclaim moving on as
- * many inodes as possible in a short period of time. It kicks itself every few
- * seconds, as well as being kicked by the inode cache shrinker when memory
- * goes low. It scans as quickly as possible avoiding locked inodes or those
- * already being flushed, and once done schedules a future pass.
- */
-void
-xfs_reclaim_worker(
-	struct work_struct *work)
-{
-	struct xfs_mount *mp = container_of(to_delayed_work(work),
-					struct xfs_mount, m_reclaim_work);
-
-	xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
-	xfs_reclaim_work_queue(mp);
-}
-
-void
-__xfs_inode_set_reclaim_tag(
-	struct xfs_perag	*pag,
-	struct xfs_inode	*ip)
-{
-	radix_tree_tag_set(&pag->pag_ici_root,
-			   XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
-			   XFS_ICI_RECLAIM_TAG);
-
-	if (!pag->pag_ici_reclaimable) {
-		/* propagate the reclaim tag up into the perag radix tree */
-		spin_lock(&ip->i_mount->m_perag_lock);
-		radix_tree_tag_set(&ip->i_mount->m_perag_tree,
-				XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
-				XFS_ICI_RECLAIM_TAG);
-		spin_unlock(&ip->i_mount->m_perag_lock);
-
-		/* schedule periodic background inode reclaim */
-		xfs_reclaim_work_queue(ip->i_mount);
-
-		trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
-							-1, _RET_IP_);
-	}
-	pag->pag_ici_reclaimable++;
-}
-
-/*
- * We set the inode flag atomically with the radix tree tag.
- * Once we get tag lookups on the radix tree, this inode flag
- * can go away.
- */
-void
-xfs_inode_set_reclaim_tag(
-	xfs_inode_t	*ip)
-{
-	struct xfs_mount *mp = ip->i_mount;
-	struct xfs_perag *pag;
-
-	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
-	spin_lock(&pag->pag_ici_lock);
-	spin_lock(&ip->i_flags_lock);
-	__xfs_inode_set_reclaim_tag(pag, ip);
-	__xfs_iflags_set(ip, XFS_IRECLAIMABLE);
-	spin_unlock(&ip->i_flags_lock);
-	spin_unlock(&pag->pag_ici_lock);
-	xfs_perag_put(pag);
-}
-
-STATIC void
-__xfs_inode_clear_reclaim(
-	xfs_perag_t	*pag,
-	xfs_inode_t	*ip)
-{
-	pag->pag_ici_reclaimable--;
-	if (!pag->pag_ici_reclaimable) {
-		/* clear the reclaim tag from the perag radix tree */
-		spin_lock(&ip->i_mount->m_perag_lock);
-		radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
-				XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
-				XFS_ICI_RECLAIM_TAG);
-		spin_unlock(&ip->i_mount->m_perag_lock);
-		trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno,
-							-1, _RET_IP_);
-	}
-}
-
-void
-__xfs_inode_clear_reclaim_tag(
-	xfs_mount_t	*mp,
-	xfs_perag_t	*pag,
-	xfs_inode_t	*ip)
-{
-	radix_tree_tag_clear(&pag->pag_ici_root,
-			XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
-	__xfs_inode_clear_reclaim(pag, ip);
-}
-
-/*
- * Grab the inode for reclaim exclusively.
- * Return 0 if we grabbed it, non-zero otherwise.
- */
-STATIC int
-xfs_reclaim_inode_grab(
-	struct xfs_inode	*ip,
-	int			flags)
-{
-	ASSERT(rcu_read_lock_held());
-
-	/* quick check for stale RCU freed inode */
-	if (!ip->i_ino)
-		return 1;
-
-	/*
-	 * If we are asked for non-blocking operation, do unlocked checks to
-	 * see if the inode already is being flushed or in reclaim to avoid
-	 * lock traffic.
-	 */
-	if ((flags & SYNC_TRYLOCK) &&
-	    __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM))
-		return 1;
-
-	/*
-	 * The radix tree lock here protects a thread in xfs_iget from racing
-	 * with us starting reclaim on the inode.  Once we have the
-	 * XFS_IRECLAIM flag set it will not touch us.
-	 *
-	 * Due to RCU lookup, we may find inodes that have been freed and only
-	 * have XFS_IRECLAIM set.  Indeed, we may see reallocated inodes that
-	 * aren't candidates for reclaim at all, so we must check the
-	 * XFS_IRECLAIMABLE is set first before proceeding to reclaim.
-	 */
-	spin_lock(&ip->i_flags_lock);
-	if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
-	    __xfs_iflags_test(ip, XFS_IRECLAIM)) {
-		/* not a reclaim candidate. */
-		spin_unlock(&ip->i_flags_lock);
-		return 1;
-	}
-	__xfs_iflags_set(ip, XFS_IRECLAIM);
-	spin_unlock(&ip->i_flags_lock);
-	return 0;
-}
-
-/*
- * Inodes in different states need to be treated differently. The following
- * table lists the inode states and the reclaim actions necessary:
- *
- *	inode state	     iflush ret		required action
- *      ---------------      ----------         ---------------
- *	bad			-		reclaim
- *	shutdown		EIO		unpin and reclaim
- *	clean, unpinned		0		reclaim
- *	stale, unpinned		0		reclaim
- *	clean, pinned(*)	0		requeue
- *	stale, pinned		EAGAIN		requeue
- *	dirty, async		-		requeue
- *	dirty, sync		0		reclaim
- *
- * (*) dgc: I don't think the clean, pinned state is possible but it gets
- * handled anyway given the order of checks implemented.
- *
- * Also, because we get the flush lock first, we know that any inode that has
- * been flushed delwri has had the flush completed by the time we check that
- * the inode is clean.
- *
- * Note that because the inode is flushed delayed write by AIL pushing, the
- * flush lock may already be held here and waiting on it can result in very
- * long latencies.  Hence for sync reclaims, where we wait on the flush lock,
- * the caller should push the AIL first before trying to reclaim inodes to
- * minimise the amount of time spent waiting.  For background relaim, we only
- * bother to reclaim clean inodes anyway.
- *
- * Hence the order of actions after gaining the locks should be:
- *	bad		=> reclaim
- *	shutdown	=> unpin and reclaim
- *	pinned, async	=> requeue
- *	pinned, sync	=> unpin
- *	stale		=> reclaim
- *	clean		=> reclaim
- *	dirty, async	=> requeue
- *	dirty, sync	=> flush, wait and reclaim
- */
-STATIC int
-xfs_reclaim_inode(
-	struct xfs_inode	*ip,
-	struct xfs_perag	*pag,
-	int			sync_mode)
-{
-	struct xfs_buf		*bp = NULL;
-	int			error;
-
-restart:
-	error = 0;
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	if (!xfs_iflock_nowait(ip)) {
-		if (!(sync_mode & SYNC_WAIT))
-			goto out;
-		xfs_iflock(ip);
-	}
-
-	if (is_bad_inode(VFS_I(ip)))
-		goto reclaim;
-	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-		xfs_iunpin_wait(ip);
-		xfs_iflush_abort(ip, false);
-		goto reclaim;
-	}
-	if (xfs_ipincount(ip)) {
-		if (!(sync_mode & SYNC_WAIT))
-			goto out_ifunlock;
-		xfs_iunpin_wait(ip);
-	}
-	if (xfs_iflags_test(ip, XFS_ISTALE))
-		goto reclaim;
-	if (xfs_inode_clean(ip))
-		goto reclaim;
-
-	/*
-	 * Never flush out dirty data during non-blocking reclaim, as it would
-	 * just contend with AIL pushing trying to do the same job.
-	 */
-	if (!(sync_mode & SYNC_WAIT))
-		goto out_ifunlock;
-
-	/*
-	 * Now we have an inode that needs flushing.
-	 *
-	 * Note that xfs_iflush will never block on the inode buffer lock, as
-	 * xfs_ifree_cluster() can lock the inode buffer before it locks the
-	 * ip->i_lock, and we are doing the exact opposite here.  As a result,
-	 * doing a blocking xfs_imap_to_bp() to get the cluster buffer would
-	 * result in an ABBA deadlock with xfs_ifree_cluster().
-	 *
-	 * As xfs_ifree_cluser() must gather all inodes that are active in the
-	 * cache to mark them stale, if we hit this case we don't actually want
-	 * to do IO here - we want the inode marked stale so we can simply
-	 * reclaim it.  Hence if we get an EAGAIN error here,  just unlock the
-	 * inode, back off and try again.  Hopefully the next pass through will
-	 * see the stale flag set on the inode.
-	 */
-	error = xfs_iflush(ip, &bp);
-	if (error == EAGAIN) {
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-		/* backoff longer than in xfs_ifree_cluster */
-		delay(2);
-		goto restart;
-	}
-
-	if (!error) {
-		error = xfs_bwrite(bp);
-		xfs_buf_relse(bp);
-	}
-
-	xfs_iflock(ip);
-reclaim:
-	xfs_ifunlock(ip);
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
-	XFS_STATS_INC(xs_ig_reclaims);
-	/*
-	 * Remove the inode from the per-AG radix tree.
-	 *
-	 * Because radix_tree_delete won't complain even if the item was never
-	 * added to the tree assert that it's been there before to catch
-	 * problems with the inode life time early on.
-	 */
-	spin_lock(&pag->pag_ici_lock);
-	if (!radix_tree_delete(&pag->pag_ici_root,
-				XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
-		ASSERT(0);
-	__xfs_inode_clear_reclaim(pag, ip);
-	spin_unlock(&pag->pag_ici_lock);
-
-	/*
-	 * Here we do an (almost) spurious inode lock in order to coordinate
-	 * with inode cache radix tree lookups.  This is because the lookup
-	 * can reference the inodes in the cache without taking references.
-	 *
-	 * We make that OK here by ensuring that we wait until the inode is
-	 * unlocked after the lookup before we go ahead and free it.
-	 */
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	xfs_qm_dqdetach(ip);
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
-	xfs_inode_free(ip);
-	return error;
-
-out_ifunlock:
-	xfs_ifunlock(ip);
-out:
-	xfs_iflags_clear(ip, XFS_IRECLAIM);
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	/*
-	 * We could return EAGAIN here to make reclaim rescan the inode tree in
-	 * a short while. However, this just burns CPU time scanning the tree
-	 * waiting for IO to complete and the reclaim work never goes back to
-	 * the idle state. Instead, return 0 to let the next scheduled
-	 * background reclaim attempt to reclaim the inode again.
-	 */
-	return 0;
-}
-
-/*
- * Walk the AGs and reclaim the inodes in them. Even if the filesystem is
- * corrupted, we still want to try to reclaim all the inodes. If we don't,
- * then a shut down during filesystem unmount reclaim walk leak all the
- * unreclaimed inodes.
- */
-int
-xfs_reclaim_inodes_ag(
-	struct xfs_mount	*mp,
-	int			flags,
-	int			*nr_to_scan)
-{
-	struct xfs_perag	*pag;
-	int			error = 0;
-	int			last_error = 0;
-	xfs_agnumber_t		ag;
-	int			trylock = flags & SYNC_TRYLOCK;
-	int			skipped;
-
-restart:
-	ag = 0;
-	skipped = 0;
-	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
-		unsigned long	first_index = 0;
-		int		done = 0;
-		int		nr_found = 0;
-
-		ag = pag->pag_agno + 1;
-
-		if (trylock) {
-			if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
-				skipped++;
-				xfs_perag_put(pag);
-				continue;
-			}
-			first_index = pag->pag_ici_reclaim_cursor;
-		} else
-			mutex_lock(&pag->pag_ici_reclaim_lock);
-
-		do {
-			struct xfs_inode *batch[XFS_LOOKUP_BATCH];
-			int	i;
-
-			rcu_read_lock();
-			nr_found = radix_tree_gang_lookup_tag(
-					&pag->pag_ici_root,
-					(void **)batch, first_index,
-					XFS_LOOKUP_BATCH,
-					XFS_ICI_RECLAIM_TAG);
-			if (!nr_found) {
-				done = 1;
-				rcu_read_unlock();
-				break;
-			}
-
-			/*
-			 * Grab the inodes before we drop the lock. if we found
-			 * nothing, nr == 0 and the loop will be skipped.
-			 */
-			for (i = 0; i < nr_found; i++) {
-				struct xfs_inode *ip = batch[i];
-
-				if (done || xfs_reclaim_inode_grab(ip, flags))
-					batch[i] = NULL;
-
-				/*
-				 * Update the index for the next lookup. Catch
-				 * overflows into the next AG range which can
-				 * occur if we have inodes in the last block of
-				 * the AG and we are currently pointing to the
-				 * last inode.
-				 *
-				 * Because we may see inodes that are from the
-				 * wrong AG due to RCU freeing and
-				 * reallocation, only update the index if it
-				 * lies in this AG. It was a race that lead us
-				 * to see this inode, so another lookup from
-				 * the same index will not find it again.
-				 */
-				if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
-								pag->pag_agno)
-					continue;
-				first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
-				if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
-					done = 1;
-			}
-
-			/* unlock now we've grabbed the inodes. */
-			rcu_read_unlock();
-
-			for (i = 0; i < nr_found; i++) {
-				if (!batch[i])
-					continue;
-				error = xfs_reclaim_inode(batch[i], pag, flags);
-				if (error && last_error != EFSCORRUPTED)
-					last_error = error;
-			}
-
-			*nr_to_scan -= XFS_LOOKUP_BATCH;
-
-			cond_resched();
-
-		} while (nr_found && !done && *nr_to_scan > 0);
-
-		if (trylock && !done)
-			pag->pag_ici_reclaim_cursor = first_index;
-		else
-			pag->pag_ici_reclaim_cursor = 0;
-		mutex_unlock(&pag->pag_ici_reclaim_lock);
-		xfs_perag_put(pag);
-	}
-
-	/*
-	 * if we skipped any AG, and we still have scan count remaining, do
-	 * another pass this time using blocking reclaim semantics (i.e
-	 * waiting on the reclaim locks and ignoring the reclaim cursors). This
-	 * ensure that when we get more reclaimers than AGs we block rather
-	 * than spin trying to execute reclaim.
-	 */
-	if (skipped && (flags & SYNC_WAIT) && *nr_to_scan > 0) {
-		trylock = 0;
-		goto restart;
-	}
-	return XFS_ERROR(last_error);
-}
-
-int
-xfs_reclaim_inodes(
-	xfs_mount_t	*mp,
-	int		mode)
-{
-	int		nr_to_scan = INT_MAX;
-
-	return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan);
-}
-
-/*
- * Scan a certain number of inodes for reclaim.
- *
- * When called we make sure that there is a background (fast) inode reclaim in
- * progress, while we will throttle the speed of reclaim via doing synchronous
- * reclaim of inodes. That means if we come across dirty inodes, we wait for
- * them to be cleaned, which we hope will not be very long due to the
- * background walker having already kicked the IO off on those dirty inodes.
- */
-void
-xfs_reclaim_inodes_nr(
-	struct xfs_mount	*mp,
-	int			nr_to_scan)
-{
-	/* kick background reclaimer and push the AIL */
-	xfs_reclaim_work_queue(mp);
-	xfs_ail_push_all(mp->m_ail);
-
-	xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan);
-}
-
-/*
- * Return the number of reclaimable inodes in the filesystem for
- * the shrinker to determine how much to reclaim.
- */
-int
-xfs_reclaim_inodes_count(
-	struct xfs_mount	*mp)
-{
-	struct xfs_perag	*pag;
-	xfs_agnumber_t		ag = 0;
-	int			reclaimable = 0;
-
-	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
-		ag = pag->pag_agno + 1;
-		reclaimable += pag->pag_ici_reclaimable;
-		xfs_perag_put(pag);
-	}
-	return reclaimable;
-}
-
diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h
deleted file mode 100644
index 0ba9c89..0000000
--- a/fs/xfs/xfs_sync.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (c) 2000-2006 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef XFS_SYNC_H
-#define XFS_SYNC_H 1
-
-struct xfs_mount;
-struct xfs_perag;
-
-#define SYNC_WAIT		0x0001	/* wait for i/o to complete */
-#define SYNC_TRYLOCK		0x0002  /* only try to lock inodes */
-
-void xfs_reclaim_worker(struct work_struct *work);
-
-int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
-int xfs_reclaim_inodes_count(struct xfs_mount *mp);
-void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
-
-void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
-void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
-void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
-				struct xfs_inode *ip);
-
-int xfs_sync_inode_grab(struct xfs_inode *ip);
-int xfs_inode_ag_iterator(struct xfs_mount *mp,
-	int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
-	int flags);
-
-#endif
-- 
1.7.10

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 41+ messages in thread

* [PATCH 12/13] xfs: move inode locking functions to xfs_inode.c
  2012-09-28  4:44 [PATCH V3 00/13] xfs: remove the xfssyncd mess Dave Chinner
                   ` (10 preceding siblings ...)
  2012-09-28  4:44 ` [PATCH 11/13] xfs: rename xfs_sync.[ch] to xfs_icache.[ch] Dave Chinner
@ 2012-09-28  4:44 ` Dave Chinner
  2012-09-28  4:44 ` [PATCH 13/13] xfs: remove xfs_iget.c Dave Chinner
  12 siblings, 0 replies; 41+ messages in thread
From: Dave Chinner @ 2012-09-28  4:44 UTC (permalink / raw)
  To: xfs

From: Dave Chinner <dchinner@redhat.com>

xfs_ilock() and friends really aren't related to the inode cache in
any way, so move them to xfs_inode.c with all the other inode
related functionality.

While doing this move, move the xfs_ilock() tracepoints to *before*
the lock is taken so that when a hang on a lock occurs we have
events to indicate which process and what inode we were trying to
lock when the hang occurred. This is much better than the current
silence we get on a hang...

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
---
 fs/xfs/xfs_iget.c  |  251 ----------------------------------------------------
 fs/xfs/xfs_inode.c |  250 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 250 insertions(+), 251 deletions(-)

diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 069c5ce..ea9a5fa 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -453,254 +453,3 @@ out_error_or_again:
 	return error;
 }
 
-/*
- * This is a wrapper routine around the xfs_ilock() routine
- * used to centralize some grungy code.  It is used in places
- * that wish to lock the inode solely for reading the extents.
- * The reason these places can't just call xfs_ilock(SHARED)
- * is that the inode lock also guards to bringing in of the
- * extents from disk for a file in b-tree format.  If the inode
- * is in b-tree format, then we need to lock the inode exclusively
- * until the extents are read in.  Locking it exclusively all
- * the time would limit our parallelism unnecessarily, though.
- * What we do instead is check to see if the extents have been
- * read in yet, and only lock the inode exclusively if they
- * have not.
- *
- * The function returns a value which should be given to the
- * corresponding xfs_iunlock_map_shared().  This value is
- * the mode in which the lock was actually taken.
- */
-uint
-xfs_ilock_map_shared(
-	xfs_inode_t	*ip)
-{
-	uint	lock_mode;
-
-	if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) &&
-	    ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) {
-		lock_mode = XFS_ILOCK_EXCL;
-	} else {
-		lock_mode = XFS_ILOCK_SHARED;
-	}
-
-	xfs_ilock(ip, lock_mode);
-
-	return lock_mode;
-}
-
-/*
- * This is simply the unlock routine to go with xfs_ilock_map_shared().
- * All it does is call xfs_iunlock() with the given lock_mode.
- */
-void
-xfs_iunlock_map_shared(
-	xfs_inode_t	*ip,
-	unsigned int	lock_mode)
-{
-	xfs_iunlock(ip, lock_mode);
-}
-
-/*
- * The xfs inode contains 2 locks: a multi-reader lock called the
- * i_iolock and a multi-reader lock called the i_lock.  This routine
- * allows either or both of the locks to be obtained.
- *
- * The 2 locks should always be ordered so that the IO lock is
- * obtained first in order to prevent deadlock.
- *
- * ip -- the inode being locked
- * lock_flags -- this parameter indicates the inode's locks
- *       to be locked.  It can be:
- *		XFS_IOLOCK_SHARED,
- *		XFS_IOLOCK_EXCL,
- *		XFS_ILOCK_SHARED,
- *		XFS_ILOCK_EXCL,
- *		XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED,
- *		XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL,
- *		XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED,
- *		XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL
- */
-void
-xfs_ilock(
-	xfs_inode_t		*ip,
-	uint			lock_flags)
-{
-	/*
-	 * You can't set both SHARED and EXCL for the same lock,
-	 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
-	 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
-	 */
-	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
-	       (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
-	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
-	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
-	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
-
-	if (lock_flags & XFS_IOLOCK_EXCL)
-		mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
-	else if (lock_flags & XFS_IOLOCK_SHARED)
-		mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
-
-	if (lock_flags & XFS_ILOCK_EXCL)
-		mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
-	else if (lock_flags & XFS_ILOCK_SHARED)
-		mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
-
-	trace_xfs_ilock(ip, lock_flags, _RET_IP_);
-}
-
-/*
- * This is just like xfs_ilock(), except that the caller
- * is guaranteed not to sleep.  It returns 1 if it gets
- * the requested locks and 0 otherwise.  If the IO lock is
- * obtained but the inode lock cannot be, then the IO lock
- * is dropped before returning.
- *
- * ip -- the inode being locked
- * lock_flags -- this parameter indicates the inode's locks to be
- *       to be locked.  See the comment for xfs_ilock() for a list
- *	 of valid values.
- */
-int
-xfs_ilock_nowait(
-	xfs_inode_t		*ip,
-	uint			lock_flags)
-{
-	/*
-	 * You can't set both SHARED and EXCL for the same lock,
-	 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
-	 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
-	 */
-	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
-	       (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
-	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
-	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
-	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
-
-	if (lock_flags & XFS_IOLOCK_EXCL) {
-		if (!mrtryupdate(&ip->i_iolock))
-			goto out;
-	} else if (lock_flags & XFS_IOLOCK_SHARED) {
-		if (!mrtryaccess(&ip->i_iolock))
-			goto out;
-	}
-	if (lock_flags & XFS_ILOCK_EXCL) {
-		if (!mrtryupdate(&ip->i_lock))
-			goto out_undo_iolock;
-	} else if (lock_flags & XFS_ILOCK_SHARED) {
-		if (!mrtryaccess(&ip->i_lock))
-			goto out_undo_iolock;
-	}
-	trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);
-	return 1;
-
- out_undo_iolock:
-	if (lock_flags & XFS_IOLOCK_EXCL)
-		mrunlock_excl(&ip->i_iolock);
-	else if (lock_flags & XFS_IOLOCK_SHARED)
-		mrunlock_shared(&ip->i_iolock);
- out:
-	return 0;
-}
-
-/*
- * xfs_iunlock() is used to drop the inode locks acquired with
- * xfs_ilock() and xfs_ilock_nowait().  The caller must pass
- * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
- * that we know which locks to drop.
- *
- * ip -- the inode being unlocked
- * lock_flags -- this parameter indicates the inode's locks to be
- *       to be unlocked.  See the comment for xfs_ilock() for a list
- *	 of valid values for this parameter.
- *
- */
-void
-xfs_iunlock(
-	xfs_inode_t		*ip,
-	uint			lock_flags)
-{
-	/*
-	 * You can't set both SHARED and EXCL for the same lock,
-	 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
-	 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
-	 */
-	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
-	       (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
-	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
-	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
-	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
-	ASSERT(lock_flags != 0);
-
-	if (lock_flags & XFS_IOLOCK_EXCL)
-		mrunlock_excl(&ip->i_iolock);
-	else if (lock_flags & XFS_IOLOCK_SHARED)
-		mrunlock_shared(&ip->i_iolock);
-
-	if (lock_flags & XFS_ILOCK_EXCL)
-		mrunlock_excl(&ip->i_lock);
-	else if (lock_flags & XFS_ILOCK_SHARED)
-		mrunlock_shared(&ip->i_lock);
-
-	trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
-}
-
-/*
- * give up write locks.  the i/o lock cannot be held nested
- * if it is being demoted.
- */
-void
-xfs_ilock_demote(
-	xfs_inode_t		*ip,
-	uint			lock_flags)
-{
-	ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL));
-	ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
-
-	if (lock_flags & XFS_ILOCK_EXCL)
-		mrdemote(&ip->i_lock);
-	if (lock_flags & XFS_IOLOCK_EXCL)
-		mrdemote(&ip->i_iolock);
-
-	trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
-}
-
-#ifdef DEBUG
-int
-xfs_isilocked(
-	xfs_inode_t		*ip,
-	uint			lock_flags)
-{
-	if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
-		if (!(lock_flags & XFS_ILOCK_SHARED))
-			return !!ip->i_lock.mr_writer;
-		return rwsem_is_locked(&ip->i_lock.mr_lock);
-	}
-
-	if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
-		if (!(lock_flags & XFS_IOLOCK_SHARED))
-			return !!ip->i_iolock.mr_writer;
-		return rwsem_is_locked(&ip->i_iolock.mr_lock);
-	}
-
-	ASSERT(0);
-	return 0;
-}
-#endif
-
-void
-__xfs_iflock(
-	struct xfs_inode	*ip)
-{
-	wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
-	DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
-
-	do {
-		prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
-		if (xfs_isiflocked(ip))
-			io_schedule();
-	} while (!xfs_iflock_nowait(ip));
-
-	finish_wait(wq, &wait.wait);
-}
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 2778258..ba404e4 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -74,6 +74,256 @@ xfs_get_extsz_hint(
 	return 0;
 }
 
+/*
+ * This is a wrapper routine around the xfs_ilock() routine used to centralize
+ * some grungy code.  It is used in places that wish to lock the inode solely
+ * for reading the extents.  The reason these places can't just call
+ * xfs_ilock(SHARED) is that the inode lock also guards to bringing in of the
+ * extents from disk for a file in b-tree format.  If the inode is in b-tree
+ * format, then we need to lock the inode exclusively until the extents are read
+ * in.  Locking it exclusively all the time would limit our parallelism
+ * unnecessarily, though.  What we do instead is check to see if the extents
+ * have been read in yet, and only lock the inode exclusively if they have not.
+ *
+ * The function returns a value which should be given to the corresponding
+ * xfs_iunlock_map_shared().  This value is the mode in which the lock was
+ * actually taken.
+ */
+uint
+xfs_ilock_map_shared(
+	xfs_inode_t	*ip)
+{
+	uint	lock_mode;
+
+	if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) &&
+	    ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) {
+		lock_mode = XFS_ILOCK_EXCL;
+	} else {
+		lock_mode = XFS_ILOCK_SHARED;
+	}
+
+	xfs_ilock(ip, lock_mode);
+
+	return lock_mode;
+}
+
+/*
+ * This is simply the unlock routine to go with xfs_ilock_map_shared().
+ * All it does is call xfs_iunlock() with the given lock_mode.
+ */
+void
+xfs_iunlock_map_shared(
+	xfs_inode_t	*ip,
+	unsigned int	lock_mode)
+{
+	xfs_iunlock(ip, lock_mode);
+}
+
+/*
+ * The xfs inode contains 2 locks: a multi-reader lock called the
+ * i_iolock and a multi-reader lock called the i_lock.  This routine
+ * allows either or both of the locks to be obtained.
+ *
+ * The 2 locks should always be ordered so that the IO lock is
+ * obtained first in order to prevent deadlock.
+ *
+ * ip -- the inode being locked
+ * lock_flags -- this parameter indicates the inode's locks
+ *       to be locked.  It can be:
+ *		XFS_IOLOCK_SHARED,
+ *		XFS_IOLOCK_EXCL,
+ *		XFS_ILOCK_SHARED,
+ *		XFS_ILOCK_EXCL,
+ *		XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED,
+ *		XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL,
+ *		XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED,
+ *		XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL
+ */
+void
+xfs_ilock(
+	xfs_inode_t		*ip,
+	uint			lock_flags)
+{
+	trace_xfs_ilock(ip, lock_flags, _RET_IP_);
+
+	/*
+	 * You can't set both SHARED and EXCL for the same lock,
+	 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
+	 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
+	 */
+	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
+	       (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
+	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
+
+	if (lock_flags & XFS_IOLOCK_EXCL)
+		mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
+	else if (lock_flags & XFS_IOLOCK_SHARED)
+		mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
+
+	if (lock_flags & XFS_ILOCK_EXCL)
+		mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
+	else if (lock_flags & XFS_ILOCK_SHARED)
+		mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
+}
+
+/*
+ * This is just like xfs_ilock(), except that the caller
+ * is guaranteed not to sleep.  It returns 1 if it gets
+ * the requested locks and 0 otherwise.  If the IO lock is
+ * obtained but the inode lock cannot be, then the IO lock
+ * is dropped before returning.
+ *
+ * ip -- the inode being locked
+ * lock_flags -- this parameter indicates the inode's locks to be
+ *       to be locked.  See the comment for xfs_ilock() for a list
+ *	 of valid values.
+ */
+int
+xfs_ilock_nowait(
+	xfs_inode_t		*ip,
+	uint			lock_flags)
+{
+	trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);
+
+	/*
+	 * You can't set both SHARED and EXCL for the same lock,
+	 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
+	 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
+	 */
+	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
+	       (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
+	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
+
+	if (lock_flags & XFS_IOLOCK_EXCL) {
+		if (!mrtryupdate(&ip->i_iolock))
+			goto out;
+	} else if (lock_flags & XFS_IOLOCK_SHARED) {
+		if (!mrtryaccess(&ip->i_iolock))
+			goto out;
+	}
+	if (lock_flags & XFS_ILOCK_EXCL) {
+		if (!mrtryupdate(&ip->i_lock))
+			goto out_undo_iolock;
+	} else if (lock_flags & XFS_ILOCK_SHARED) {
+		if (!mrtryaccess(&ip->i_lock))
+			goto out_undo_iolock;
+	}
+	return 1;
+
+ out_undo_iolock:
+	if (lock_flags & XFS_IOLOCK_EXCL)
+		mrunlock_excl(&ip->i_iolock);
+	else if (lock_flags & XFS_IOLOCK_SHARED)
+		mrunlock_shared(&ip->i_iolock);
+ out:
+	return 0;
+}
+
+/*
+ * xfs_iunlock() is used to drop the inode locks acquired with
+ * xfs_ilock() and xfs_ilock_nowait().  The caller must pass
+ * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
+ * that we know which locks to drop.
+ *
+ * ip -- the inode being unlocked
+ * lock_flags -- this parameter indicates the inode's locks to be
+ *       to be unlocked.  See the comment for xfs_ilock() for a list
+ *	 of valid values for this parameter.
+ *
+ */
+void
+xfs_iunlock(
+	xfs_inode_t		*ip,
+	uint			lock_flags)
+{
+	/*
+	 * You can't set both SHARED and EXCL for the same lock,
+	 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
+	 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
+	 */
+	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
+	       (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
+	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
+	ASSERT(lock_flags != 0);
+
+	if (lock_flags & XFS_IOLOCK_EXCL)
+		mrunlock_excl(&ip->i_iolock);
+	else if (lock_flags & XFS_IOLOCK_SHARED)
+		mrunlock_shared(&ip->i_iolock);
+
+	if (lock_flags & XFS_ILOCK_EXCL)
+		mrunlock_excl(&ip->i_lock);
+	else if (lock_flags & XFS_ILOCK_SHARED)
+		mrunlock_shared(&ip->i_lock);
+
+	trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
+}
+
+/*
+ * give up write locks.  the i/o lock cannot be held nested
+ * if it is being demoted.
+ */
+void
+xfs_ilock_demote(
+	xfs_inode_t		*ip,
+	uint			lock_flags)
+{
+	ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL));
+	ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
+
+	if (lock_flags & XFS_ILOCK_EXCL)
+		mrdemote(&ip->i_lock);
+	if (lock_flags & XFS_IOLOCK_EXCL)
+		mrdemote(&ip->i_iolock);
+
+	trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
+}
+
+#ifdef DEBUG
+int
+xfs_isilocked(
+	xfs_inode_t		*ip,
+	uint			lock_flags)
+{
+	if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
+		if (!(lock_flags & XFS_ILOCK_SHARED))
+			return !!ip->i_lock.mr_writer;
+		return rwsem_is_locked(&ip->i_lock.mr_lock);
+	}
+
+	if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
+		if (!(lock_flags & XFS_IOLOCK_SHARED))
+			return !!ip->i_iolock.mr_writer;
+		return rwsem_is_locked(&ip->i_iolock.mr_lock);
+	}
+
+	ASSERT(0);
+	return 0;
+}
+#endif
+
+void
+__xfs_iflock(
+	struct xfs_inode	*ip)
+{
+	wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
+	DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
+
+	do {
+		prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+		if (xfs_isiflocked(ip))
+			io_schedule();
+	} while (!xfs_iflock_nowait(ip));
+
+	finish_wait(wq, &wait.wait);
+}
+
 #ifdef DEBUG
 /*
  * Make sure that the extents in the given memory buffer
-- 
1.7.10

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 41+ messages in thread

* [PATCH 13/13] xfs: remove xfs_iget.c
  2012-09-28  4:44 [PATCH V3 00/13] xfs: remove the xfssyncd mess Dave Chinner
                   ` (11 preceding siblings ...)
  2012-09-28  4:44 ` [PATCH 12/13] xfs: move inode locking functions to xfs_inode.c Dave Chinner
@ 2012-09-28  4:44 ` Dave Chinner
  12 siblings, 0 replies; 41+ messages in thread
From: Dave Chinner @ 2012-09-28  4:44 UTC (permalink / raw)
  To: xfs

From: Dave Chinner <dchinner@redhat.com>

The inode cache functions remaining in xfs_iget.c can be moved to xfs_icache.c
along with the other inode cache functions. This removes all functionality from
xfs_iget.c, so the file can simply be removed.

This move results in various functions now only having the scope of a single
file (e.g. xfs_inode_free()), so clean up all the definitions and exported
prototypes in xfs_icache.[ch] and xfs_inode.h appropriately.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
---
 fs/xfs/Makefile          |    1 -
 fs/xfs/xfs_export.c      |    1 +
 fs/xfs/xfs_icache.c      |  421 +++++++++++++++++++++++++++++++++++++++++-
 fs/xfs/xfs_icache.h      |    6 +-
 fs/xfs/xfs_iget.c        |  455 ----------------------------------------------
 fs/xfs/xfs_inode.c       |    1 +
 fs/xfs/xfs_inode.h       |   10 +-
 fs/xfs/xfs_itable.c      |    1 +
 fs/xfs/xfs_log_recover.c |    1 +
 fs/xfs/xfs_qm.c          |    1 +
 fs/xfs/xfs_rtalloc.c     |    1 +
 fs/xfs/xfs_vnodeops.c    |    1 +
 12 files changed, 430 insertions(+), 470 deletions(-)
 delete mode 100644 fs/xfs/xfs_iget.c

diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 442f256..e65357b 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -40,7 +40,6 @@ xfs-y				+= xfs_aops.o \
 				   xfs_fs_subr.o \
 				   xfs_globals.o \
 				   xfs_icache.o \
-				   xfs_iget.o \
 				   xfs_ioctl.o \
 				   xfs_iomap.o \
 				   xfs_iops.o \
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 4267922..9b6e330 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -29,6 +29,7 @@
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
 #include "xfs_trace.h"
+#include "xfs_icache.h"
 
 /*
  * Note that we only accept fileids which are long enough rather than allow
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index eba216f..9c8703b 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -41,6 +41,421 @@
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 
+STATIC void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp,
+				struct xfs_perag *pag, struct xfs_inode *ip);
+
+/*
+ * Allocate and initialise an xfs_inode.
+ */
+STATIC struct xfs_inode *
+xfs_inode_alloc(
+	struct xfs_mount	*mp,
+	xfs_ino_t		ino)
+{
+	struct xfs_inode	*ip;
+
+	/*
+	 * if this didn't occur in transactions, we could use
+	 * KM_MAYFAIL and return NULL here on ENOMEM. Set the
+	 * code up to do this anyway.
+	 */
+	ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP);
+	if (!ip)
+		return NULL;
+	if (inode_init_always(mp->m_super, VFS_I(ip))) {
+		kmem_zone_free(xfs_inode_zone, ip);
+		return NULL;
+	}
+
+	ASSERT(atomic_read(&ip->i_pincount) == 0);
+	ASSERT(!spin_is_locked(&ip->i_flags_lock));
+	ASSERT(!xfs_isiflocked(ip));
+	ASSERT(ip->i_ino == 0);
+
+	mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
+
+	/* initialise the xfs inode */
+	ip->i_ino = ino;
+	ip->i_mount = mp;
+	memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
+	ip->i_afp = NULL;
+	memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
+	ip->i_flags = 0;
+	ip->i_delayed_blks = 0;
+	memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
+
+	return ip;
+}
+
+STATIC void
+xfs_inode_free_callback(
+	struct rcu_head		*head)
+{
+	struct inode		*inode = container_of(head, struct inode, i_rcu);
+	struct xfs_inode	*ip = XFS_I(inode);
+
+	kmem_zone_free(xfs_inode_zone, ip);
+}
+
+STATIC void
+xfs_inode_free(
+	struct xfs_inode	*ip)
+{
+	switch (ip->i_d.di_mode & S_IFMT) {
+	case S_IFREG:
+	case S_IFDIR:
+	case S_IFLNK:
+		xfs_idestroy_fork(ip, XFS_DATA_FORK);
+		break;
+	}
+
+	if (ip->i_afp)
+		xfs_idestroy_fork(ip, XFS_ATTR_FORK);
+
+	if (ip->i_itemp) {
+		ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL));
+		xfs_inode_item_destroy(ip);
+		ip->i_itemp = NULL;
+	}
+
+	/* asserts to verify all state is correct here */
+	ASSERT(atomic_read(&ip->i_pincount) == 0);
+	ASSERT(!spin_is_locked(&ip->i_flags_lock));
+	ASSERT(!xfs_isiflocked(ip));
+
+	/*
+	 * Because we use RCU freeing we need to ensure the inode always
+	 * appears to be reclaimed with an invalid inode number when in the
+	 * free state. The ip->i_flags_lock provides the barrier against lookup
+	 * races.
+	 */
+	spin_lock(&ip->i_flags_lock);
+	ip->i_flags = XFS_IRECLAIM;
+	ip->i_ino = 0;
+	spin_unlock(&ip->i_flags_lock);
+
+	call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
+}
+
+/*
+ * Check the validity of the inode we just found it the cache
+ */
+static int
+xfs_iget_cache_hit(
+	struct xfs_perag	*pag,
+	struct xfs_inode	*ip,
+	xfs_ino_t		ino,
+	int			flags,
+	int			lock_flags) __releases(RCU)
+{
+	struct inode		*inode = VFS_I(ip);
+	struct xfs_mount	*mp = ip->i_mount;
+	int			error;
+
+	/*
+	 * check for re-use of an inode within an RCU grace period due to the
+	 * radix tree nodes not being updated yet. We monitor for this by
+	 * setting the inode number to zero before freeing the inode structure.
+	 * If the inode has been reallocated and set up, then the inode number
+	 * will not match, so check for that, too.
+	 */
+	spin_lock(&ip->i_flags_lock);
+	if (ip->i_ino != ino) {
+		trace_xfs_iget_skip(ip);
+		XFS_STATS_INC(xs_ig_frecycle);
+		error = EAGAIN;
+		goto out_error;
+	}
+
+
+	/*
+	 * If we are racing with another cache hit that is currently
+	 * instantiating this inode or currently recycling it out of
+	 * reclaimabe state, wait for the initialisation to complete
+	 * before continuing.
+	 *
+	 * XXX(hch): eventually we should do something equivalent to
+	 *	     wait_on_inode to wait for these flags to be cleared
+	 *	     instead of polling for it.
+	 */
+	if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
+		trace_xfs_iget_skip(ip);
+		XFS_STATS_INC(xs_ig_frecycle);
+		error = EAGAIN;
+		goto out_error;
+	}
+
+	/*
+	 * If lookup is racing with unlink return an error immediately.
+	 */
+	if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
+		error = ENOENT;
+		goto out_error;
+	}
+
+	/*
+	 * If IRECLAIMABLE is set, we've torn down the VFS inode already.
+	 * Need to carefully get it back into useable state.
+	 */
+	if (ip->i_flags & XFS_IRECLAIMABLE) {
+		trace_xfs_iget_reclaim(ip);
+
+		/*
+		 * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode
+		 * from stomping over us while we recycle the inode.  We can't
+		 * clear the radix tree reclaimable tag yet as it requires
+		 * pag_ici_lock to be held exclusive.
+		 */
+		ip->i_flags |= XFS_IRECLAIM;
+
+		spin_unlock(&ip->i_flags_lock);
+		rcu_read_unlock();
+
+		error = -inode_init_always(mp->m_super, inode);
+		if (error) {
+			/*
+			 * Re-initializing the inode failed, and we are in deep
+			 * trouble.  Try to re-add it to the reclaim list.
+			 */
+			rcu_read_lock();
+			spin_lock(&ip->i_flags_lock);
+
+			ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
+			ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
+			trace_xfs_iget_reclaim_fail(ip);
+			goto out_error;
+		}
+
+		spin_lock(&pag->pag_ici_lock);
+		spin_lock(&ip->i_flags_lock);
+
+		/*
+		 * Clear the per-lifetime state in the inode as we are now
+		 * effectively a new inode and need to return to the initial
+		 * state before reuse occurs.
+		 */
+		ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
+		ip->i_flags |= XFS_INEW;
+		__xfs_inode_clear_reclaim_tag(mp, pag, ip);
+		inode->i_state = I_NEW;
+
+		ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
+		mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
+
+		spin_unlock(&ip->i_flags_lock);
+		spin_unlock(&pag->pag_ici_lock);
+	} else {
+		/* If the VFS inode is being torn down, pause and try again. */
+		if (!igrab(inode)) {
+			trace_xfs_iget_skip(ip);
+			error = EAGAIN;
+			goto out_error;
+		}
+
+		/* We've got a live one. */
+		spin_unlock(&ip->i_flags_lock);
+		rcu_read_unlock();
+		trace_xfs_iget_hit(ip);
+	}
+
+	if (lock_flags != 0)
+		xfs_ilock(ip, lock_flags);
+
+	xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE);
+	XFS_STATS_INC(xs_ig_found);
+
+	return 0;
+
+out_error:
+	spin_unlock(&ip->i_flags_lock);
+	rcu_read_unlock();
+	return error;
+}
+
+
+static int
+xfs_iget_cache_miss(
+	struct xfs_mount	*mp,
+	struct xfs_perag	*pag,
+	xfs_trans_t		*tp,
+	xfs_ino_t		ino,
+	struct xfs_inode	**ipp,
+	int			flags,
+	int			lock_flags)
+{
+	struct xfs_inode	*ip;
+	int			error;
+	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ino);
+	int			iflags;
+
+	ip = xfs_inode_alloc(mp, ino);
+	if (!ip)
+		return ENOMEM;
+
+	error = xfs_iread(mp, tp, ip, flags);
+	if (error)
+		goto out_destroy;
+
+	trace_xfs_iget_miss(ip);
+
+	if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
+		error = ENOENT;
+		goto out_destroy;
+	}
+
+	/*
+	 * Preload the radix tree so we can insert safely under the
+	 * write spinlock. Note that we cannot sleep inside the preload
+	 * region. Since we can be called from transaction context, don't
+	 * recurse into the file system.
+	 */
+	if (radix_tree_preload(GFP_NOFS)) {
+		error = EAGAIN;
+		goto out_destroy;
+	}
+
+	/*
+	 * Because the inode hasn't been added to the radix-tree yet it can't
+	 * be found by another thread, so we can do the non-sleeping lock here.
+	 */
+	if (lock_flags) {
+		if (!xfs_ilock_nowait(ip, lock_flags))
+			BUG();
+	}
+
+	/*
+	 * These values must be set before inserting the inode into the radix
+	 * tree as the moment it is inserted a concurrent lookup (allowed by the
+	 * RCU locking mechanism) can find it and that lookup must see that this
+	 * is an inode currently under construction (i.e. that XFS_INEW is set).
+	 * The ip->i_flags_lock that protects the XFS_INEW flag forms the
+	 * memory barrier that ensures this detection works correctly at lookup
+	 * time.
+	 */
+	iflags = XFS_INEW;
+	if (flags & XFS_IGET_DONTCACHE)
+		iflags |= XFS_IDONTCACHE;
+	ip->i_udquot = ip->i_gdquot = NULL;
+	xfs_iflags_set(ip, iflags);
+
+	/* insert the new inode */
+	spin_lock(&pag->pag_ici_lock);
+	error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
+	if (unlikely(error)) {
+		WARN_ON(error != -EEXIST);
+		XFS_STATS_INC(xs_ig_dup);
+		error = EAGAIN;
+		goto out_preload_end;
+	}
+	spin_unlock(&pag->pag_ici_lock);
+	radix_tree_preload_end();
+
+	*ipp = ip;
+	return 0;
+
+out_preload_end:
+	spin_unlock(&pag->pag_ici_lock);
+	radix_tree_preload_end();
+	if (lock_flags)
+		xfs_iunlock(ip, lock_flags);
+out_destroy:
+	__destroy_inode(VFS_I(ip));
+	xfs_inode_free(ip);
+	return error;
+}
+
+/*
+ * Look up an inode by number in the given file system.
+ * The inode is looked up in the cache held in each AG.
+ * If the inode is found in the cache, initialise the vfs inode
+ * if necessary.
+ *
+ * If it is not in core, read it in from the file system's device,
+ * add it to the cache and initialise the vfs inode.
+ *
+ * The inode is locked according to the value of the lock_flags parameter.
+ * This flag parameter indicates how and if the inode's IO lock and inode lock
+ * should be taken.
+ *
+ * mp -- the mount point structure for the current file system.  It points
+ *       to the inode hash table.
+ * tp -- a pointer to the current transaction if there is one.  This is
+ *       simply passed through to the xfs_iread() call.
+ * ino -- the number of the inode desired.  This is the unique identifier
+ *        within the file system for the inode being requested.
+ * lock_flags -- flags indicating how to lock the inode.  See the comment
+ *		 for xfs_ilock() for a list of valid values.
+ */
+int
+xfs_iget(
+	xfs_mount_t	*mp,
+	xfs_trans_t	*tp,
+	xfs_ino_t	ino,
+	uint		flags,
+	uint		lock_flags,
+	xfs_inode_t	**ipp)
+{
+	xfs_inode_t	*ip;
+	int		error;
+	xfs_perag_t	*pag;
+	xfs_agino_t	agino;
+
+	/*
+	 * xfs_reclaim_inode() uses the ILOCK to ensure an inode
+	 * doesn't get freed while it's being referenced during a
+	 * radix tree traversal here.  It assumes this function
+	 * aqcuires only the ILOCK (and therefore it has no need to
+	 * involve the IOLOCK in this synchronization).
+	 */
+	ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);
+
+	/* reject inode numbers outside existing AGs */
+	if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
+		return EINVAL;
+
+	/* get the perag structure and ensure that it's inode capable */
+	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
+	agino = XFS_INO_TO_AGINO(mp, ino);
+
+again:
+	error = 0;
+	rcu_read_lock();
+	ip = radix_tree_lookup(&pag->pag_ici_root, agino);
+
+	if (ip) {
+		error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
+		if (error)
+			goto out_error_or_again;
+	} else {
+		rcu_read_unlock();
+		XFS_STATS_INC(xs_ig_missed);
+
+		error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
+							flags, lock_flags);
+		if (error)
+			goto out_error_or_again;
+	}
+	xfs_perag_put(pag);
+
+	*ipp = ip;
+
+	/*
+	 * If we have a real type for an on-disk inode, we can set ops(&unlock)
+	 * now.	 If it's a new inode being created, xfs_ialloc will handle it.
+	 */
+	if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0)
+		xfs_setup_inode(ip);
+	return 0;
+
+out_error_or_again:
+	if (error == EAGAIN) {
+		delay(1);
+		goto again;
+	}
+	xfs_perag_put(pag);
+	return error;
+}
+
 /*
  * The inode lookup is done in batches to keep the amount of lock traffic and
  * radix tree lookups to a minimum. The batch size is a trade off between
@@ -253,7 +668,7 @@ xfs_reclaim_worker(
 	xfs_reclaim_work_queue(mp);
 }
 
-void
+static void
 __xfs_inode_set_reclaim_tag(
 	struct xfs_perag	*pag,
 	struct xfs_inode	*ip)
@@ -319,7 +734,7 @@ __xfs_inode_clear_reclaim(
 	}
 }
 
-void
+STATIC void
 __xfs_inode_clear_reclaim_tag(
 	xfs_mount_t	*mp,
 	xfs_perag_t	*pag,
@@ -542,7 +957,7 @@ out:
  * then a shut down during filesystem unmount reclaim walk leak all the
  * unreclaimed inodes.
  */
-int
+STATIC int
 xfs_reclaim_inodes_ag(
 	struct xfs_mount	*mp,
 	int			flags,
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 0ba9c89..222e22f 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -24,6 +24,9 @@ struct xfs_perag;
 #define SYNC_WAIT		0x0001	/* wait for i/o to complete */
 #define SYNC_TRYLOCK		0x0002  /* only try to lock inodes */
 
+int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino,
+	     uint flags, uint lock_flags, xfs_inode_t **ipp);
+
 void xfs_reclaim_worker(struct work_struct *work);
 
 int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
@@ -31,9 +34,6 @@ int xfs_reclaim_inodes_count(struct xfs_mount *mp);
 void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
 
 void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
-void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
-void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
-				struct xfs_inode *ip);
 
 int xfs_sync_inode_grab(struct xfs_inode *ip);
 int xfs_inode_ag_iterator(struct xfs_mount *mp,
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
deleted file mode 100644
index ea9a5fa..0000000
--- a/fs/xfs/xfs_iget.c
+++ /dev/null
@@ -1,455 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_acl.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
-#include "xfs_quota.h"
-#include "xfs_utils.h"
-#include "xfs_trans_priv.h"
-#include "xfs_inode_item.h"
-#include "xfs_bmap.h"
-#include "xfs_trace.h"
-#include "xfs_icache.h"
-
-
-/*
- * Allocate and initialise an xfs_inode.
- */
-STATIC struct xfs_inode *
-xfs_inode_alloc(
-	struct xfs_mount	*mp,
-	xfs_ino_t		ino)
-{
-	struct xfs_inode	*ip;
-
-	/*
-	 * if this didn't occur in transactions, we could use
-	 * KM_MAYFAIL and return NULL here on ENOMEM. Set the
-	 * code up to do this anyway.
-	 */
-	ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP);
-	if (!ip)
-		return NULL;
-	if (inode_init_always(mp->m_super, VFS_I(ip))) {
-		kmem_zone_free(xfs_inode_zone, ip);
-		return NULL;
-	}
-
-	ASSERT(atomic_read(&ip->i_pincount) == 0);
-	ASSERT(!spin_is_locked(&ip->i_flags_lock));
-	ASSERT(!xfs_isiflocked(ip));
-	ASSERT(ip->i_ino == 0);
-
-	mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
-
-	/* initialise the xfs inode */
-	ip->i_ino = ino;
-	ip->i_mount = mp;
-	memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
-	ip->i_afp = NULL;
-	memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
-	ip->i_flags = 0;
-	ip->i_delayed_blks = 0;
-	memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
-
-	return ip;
-}
-
-STATIC void
-xfs_inode_free_callback(
-	struct rcu_head		*head)
-{
-	struct inode		*inode = container_of(head, struct inode, i_rcu);
-	struct xfs_inode	*ip = XFS_I(inode);
-
-	kmem_zone_free(xfs_inode_zone, ip);
-}
-
-void
-xfs_inode_free(
-	struct xfs_inode	*ip)
-{
-	switch (ip->i_d.di_mode & S_IFMT) {
-	case S_IFREG:
-	case S_IFDIR:
-	case S_IFLNK:
-		xfs_idestroy_fork(ip, XFS_DATA_FORK);
-		break;
-	}
-
-	if (ip->i_afp)
-		xfs_idestroy_fork(ip, XFS_ATTR_FORK);
-
-	if (ip->i_itemp) {
-		ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL));
-		xfs_inode_item_destroy(ip);
-		ip->i_itemp = NULL;
-	}
-
-	/* asserts to verify all state is correct here */
-	ASSERT(atomic_read(&ip->i_pincount) == 0);
-	ASSERT(!spin_is_locked(&ip->i_flags_lock));
-	ASSERT(!xfs_isiflocked(ip));
-
-	/*
-	 * Because we use RCU freeing we need to ensure the inode always
-	 * appears to be reclaimed with an invalid inode number when in the
-	 * free state. The ip->i_flags_lock provides the barrier against lookup
-	 * races.
-	 */
-	spin_lock(&ip->i_flags_lock);
-	ip->i_flags = XFS_IRECLAIM;
-	ip->i_ino = 0;
-	spin_unlock(&ip->i_flags_lock);
-
-	call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
-}
-
-/*
- * Check the validity of the inode we just found it the cache
- */
-static int
-xfs_iget_cache_hit(
-	struct xfs_perag	*pag,
-	struct xfs_inode	*ip,
-	xfs_ino_t		ino,
-	int			flags,
-	int			lock_flags) __releases(RCU)
-{
-	struct inode		*inode = VFS_I(ip);
-	struct xfs_mount	*mp = ip->i_mount;
-	int			error;
-
-	/*
-	 * check for re-use of an inode within an RCU grace period due to the
-	 * radix tree nodes not being updated yet. We monitor for this by
-	 * setting the inode number to zero before freeing the inode structure.
-	 * If the inode has been reallocated and set up, then the inode number
-	 * will not match, so check for that, too.
-	 */
-	spin_lock(&ip->i_flags_lock);
-	if (ip->i_ino != ino) {
-		trace_xfs_iget_skip(ip);
-		XFS_STATS_INC(xs_ig_frecycle);
-		error = EAGAIN;
-		goto out_error;
-	}
-
-
-	/*
-	 * If we are racing with another cache hit that is currently
-	 * instantiating this inode or currently recycling it out of
-	 * reclaimabe state, wait for the initialisation to complete
-	 * before continuing.
-	 *
-	 * XXX(hch): eventually we should do something equivalent to
-	 *	     wait_on_inode to wait for these flags to be cleared
-	 *	     instead of polling for it.
-	 */
-	if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
-		trace_xfs_iget_skip(ip);
-		XFS_STATS_INC(xs_ig_frecycle);
-		error = EAGAIN;
-		goto out_error;
-	}
-
-	/*
-	 * If lookup is racing with unlink return an error immediately.
-	 */
-	if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
-		error = ENOENT;
-		goto out_error;
-	}
-
-	/*
-	 * If IRECLAIMABLE is set, we've torn down the VFS inode already.
-	 * Need to carefully get it back into useable state.
-	 */
-	if (ip->i_flags & XFS_IRECLAIMABLE) {
-		trace_xfs_iget_reclaim(ip);
-
-		/*
-		 * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode
-		 * from stomping over us while we recycle the inode.  We can't
-		 * clear the radix tree reclaimable tag yet as it requires
-		 * pag_ici_lock to be held exclusive.
-		 */
-		ip->i_flags |= XFS_IRECLAIM;
-
-		spin_unlock(&ip->i_flags_lock);
-		rcu_read_unlock();
-
-		error = -inode_init_always(mp->m_super, inode);
-		if (error) {
-			/*
-			 * Re-initializing the inode failed, and we are in deep
-			 * trouble.  Try to re-add it to the reclaim list.
-			 */
-			rcu_read_lock();
-			spin_lock(&ip->i_flags_lock);
-
-			ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
-			ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
-			trace_xfs_iget_reclaim_fail(ip);
-			goto out_error;
-		}
-
-		spin_lock(&pag->pag_ici_lock);
-		spin_lock(&ip->i_flags_lock);
-
-		/*
-		 * Clear the per-lifetime state in the inode as we are now
-		 * effectively a new inode and need to return to the initial
-		 * state before reuse occurs.
-		 */
-		ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
-		ip->i_flags |= XFS_INEW;
-		__xfs_inode_clear_reclaim_tag(mp, pag, ip);
-		inode->i_state = I_NEW;
-
-		ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
-		mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
-
-		spin_unlock(&ip->i_flags_lock);
-		spin_unlock(&pag->pag_ici_lock);
-	} else {
-		/* If the VFS inode is being torn down, pause and try again. */
-		if (!igrab(inode)) {
-			trace_xfs_iget_skip(ip);
-			error = EAGAIN;
-			goto out_error;
-		}
-
-		/* We've got a live one. */
-		spin_unlock(&ip->i_flags_lock);
-		rcu_read_unlock();
-		trace_xfs_iget_hit(ip);
-	}
-
-	if (lock_flags != 0)
-		xfs_ilock(ip, lock_flags);
-
-	xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE);
-	XFS_STATS_INC(xs_ig_found);
-
-	return 0;
-
-out_error:
-	spin_unlock(&ip->i_flags_lock);
-	rcu_read_unlock();
-	return error;
-}
-
-
-static int
-xfs_iget_cache_miss(
-	struct xfs_mount	*mp,
-	struct xfs_perag	*pag,
-	xfs_trans_t		*tp,
-	xfs_ino_t		ino,
-	struct xfs_inode	**ipp,
-	int			flags,
-	int			lock_flags)
-{
-	struct xfs_inode	*ip;
-	int			error;
-	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ino);
-	int			iflags;
-
-	ip = xfs_inode_alloc(mp, ino);
-	if (!ip)
-		return ENOMEM;
-
-	error = xfs_iread(mp, tp, ip, flags);
-	if (error)
-		goto out_destroy;
-
-	trace_xfs_iget_miss(ip);
-
-	if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
-		error = ENOENT;
-		goto out_destroy;
-	}
-
-	/*
-	 * Preload the radix tree so we can insert safely under the
-	 * write spinlock. Note that we cannot sleep inside the preload
-	 * region. Since we can be called from transaction context, don't
-	 * recurse into the file system.
-	 */
-	if (radix_tree_preload(GFP_NOFS)) {
-		error = EAGAIN;
-		goto out_destroy;
-	}
-
-	/*
-	 * Because the inode hasn't been added to the radix-tree yet it can't
-	 * be found by another thread, so we can do the non-sleeping lock here.
-	 */
-	if (lock_flags) {
-		if (!xfs_ilock_nowait(ip, lock_flags))
-			BUG();
-	}
-
-	/*
-	 * These values must be set before inserting the inode into the radix
-	 * tree as the moment it is inserted a concurrent lookup (allowed by the
-	 * RCU locking mechanism) can find it and that lookup must see that this
-	 * is an inode currently under construction (i.e. that XFS_INEW is set).
-	 * The ip->i_flags_lock that protects the XFS_INEW flag forms the
-	 * memory barrier that ensures this detection works correctly at lookup
-	 * time.
-	 */
-	iflags = XFS_INEW;
-	if (flags & XFS_IGET_DONTCACHE)
-		iflags |= XFS_IDONTCACHE;
-	ip->i_udquot = ip->i_gdquot = NULL;
-	xfs_iflags_set(ip, iflags);
-
-	/* insert the new inode */
-	spin_lock(&pag->pag_ici_lock);
-	error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
-	if (unlikely(error)) {
-		WARN_ON(error != -EEXIST);
-		XFS_STATS_INC(xs_ig_dup);
-		error = EAGAIN;
-		goto out_preload_end;
-	}
-	spin_unlock(&pag->pag_ici_lock);
-	radix_tree_preload_end();
-
-	*ipp = ip;
-	return 0;
-
-out_preload_end:
-	spin_unlock(&pag->pag_ici_lock);
-	radix_tree_preload_end();
-	if (lock_flags)
-		xfs_iunlock(ip, lock_flags);
-out_destroy:
-	__destroy_inode(VFS_I(ip));
-	xfs_inode_free(ip);
-	return error;
-}
-
-/*
- * Look up an inode by number in the given file system.
- * The inode is looked up in the cache held in each AG.
- * If the inode is found in the cache, initialise the vfs inode
- * if necessary.
- *
- * If it is not in core, read it in from the file system's device,
- * add it to the cache and initialise the vfs inode.
- *
- * The inode is locked according to the value of the lock_flags parameter.
- * This flag parameter indicates how and if the inode's IO lock and inode lock
- * should be taken.
- *
- * mp -- the mount point structure for the current file system.  It points
- *       to the inode hash table.
- * tp -- a pointer to the current transaction if there is one.  This is
- *       simply passed through to the xfs_iread() call.
- * ino -- the number of the inode desired.  This is the unique identifier
- *        within the file system for the inode being requested.
- * lock_flags -- flags indicating how to lock the inode.  See the comment
- *		 for xfs_ilock() for a list of valid values.
- */
-int
-xfs_iget(
-	xfs_mount_t	*mp,
-	xfs_trans_t	*tp,
-	xfs_ino_t	ino,
-	uint		flags,
-	uint		lock_flags,
-	xfs_inode_t	**ipp)
-{
-	xfs_inode_t	*ip;
-	int		error;
-	xfs_perag_t	*pag;
-	xfs_agino_t	agino;
-
-	/*
-	 * xfs_reclaim_inode() uses the ILOCK to ensure an inode
-	 * doesn't get freed while it's being referenced during a
-	 * radix tree traversal here.  It assumes this function
-	 * aqcuires only the ILOCK (and therefore it has no need to
-	 * involve the IOLOCK in this synchronization).
-	 */
-	ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);
-
-	/* reject inode numbers outside existing AGs */
-	if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
-		return EINVAL;
-
-	/* get the perag structure and ensure that it's inode capable */
-	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
-	agino = XFS_INO_TO_AGINO(mp, ino);
-
-again:
-	error = 0;
-	rcu_read_lock();
-	ip = radix_tree_lookup(&pag->pag_ici_root, agino);
-
-	if (ip) {
-		error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
-		if (error)
-			goto out_error_or_again;
-	} else {
-		rcu_read_unlock();
-		XFS_STATS_INC(xs_ig_missed);
-
-		error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
-							flags, lock_flags);
-		if (error)
-			goto out_error_or_again;
-	}
-	xfs_perag_put(pag);
-
-	*ipp = ip;
-
-	/*
-	 * If we have a real type for an on-disk inode, we can set ops(&unlock)
-	 * now.	 If it's a new inode being created, xfs_ialloc will handle it.
-	 */
-	if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0)
-		xfs_setup_inode(ip);
-	return 0;
-
-out_error_or_again:
-	if (error == EAGAIN) {
-		delay(1);
-		goto again;
-	}
-	xfs_perag_put(pag);
-	return error;
-}
-
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index ba404e4..bba8f37 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -45,6 +45,7 @@
 #include "xfs_filestream.h"
 #include "xfs_vnodeops.h"
 #include "xfs_trace.h"
+#include "xfs_icache.h"
 
 kmem_zone_t *xfs_ifork_zone;
 kmem_zone_t *xfs_inode_zone;
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index a12fe18..da69c18 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -506,11 +506,10 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
 	(((pip)->i_mount->m_flags & XFS_MOUNT_GRPID) || \
 	 ((pip)->i_d.di_mode & S_ISGID))
 
+
 /*
- * xfs_iget.c prototypes.
+ * xfs_inode.c prototypes.
  */
-int		xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
-			 uint, uint, xfs_inode_t **);
 void		xfs_ilock(xfs_inode_t *, uint);
 int		xfs_ilock_nowait(xfs_inode_t *, uint);
 void		xfs_iunlock(xfs_inode_t *, uint);
@@ -518,11 +517,6 @@ void		xfs_ilock_demote(xfs_inode_t *, uint);
 int		xfs_isilocked(xfs_inode_t *, uint);
 uint		xfs_ilock_map_shared(xfs_inode_t *);
 void		xfs_iunlock_map_shared(xfs_inode_t *, uint);
-void		xfs_inode_free(struct xfs_inode *ip);
-
-/*
- * xfs_inode.c prototypes.
- */
 int		xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t,
 			   xfs_nlink_t, xfs_dev_t, prid_t, int,
 			   struct xfs_buf **, xfs_inode_t **);
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 01d10a6..3998fd2 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -34,6 +34,7 @@
 #include "xfs_error.h"
 #include "xfs_btree.h"
 #include "xfs_trace.h"
+#include "xfs_icache.h"
 
 STATIC int
 xfs_internal_inum(
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 5da3ace..651c988 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -42,6 +42,7 @@
 #include "xfs_quota.h"
 #include "xfs_utils.h"
 #include "xfs_trace.h"
+#include "xfs_icache.h"
 
 STATIC int
 xlog_find_zeroed(
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 2e86fa0..48c750b 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -40,6 +40,7 @@
 #include "xfs_utils.h"
 #include "xfs_qm.h"
 #include "xfs_trace.h"
+#include "xfs_icache.h"
 
 /*
  * The global quota manager. There is only one of these for the entire
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index ca28a4b..a69e0b4 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -38,6 +38,7 @@
 #include "xfs_utils.h"
 #include "xfs_trace.h"
 #include "xfs_buf.h"
+#include "xfs_icache.h"
 
 
 /*
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 2a5c6373..e9cae13 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -47,6 +47,7 @@
 #include "xfs_filestream.h"
 #include "xfs_vnodeops.h"
 #include "xfs_trace.h"
+#include "xfs_icache.h"
 
 /*
  * The maximum pathlen is 1024 bytes. Since the minimum file system
-- 
1.7.10

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 41+ messages in thread

* Re: [PATCH 02/13] xfs: rationalise xfs_mount_wq users
  2012-09-28  4:44 ` [PATCH 02/13] xfs: rationalise xfs_mount_wq users Dave Chinner
@ 2012-09-28 12:27   ` Christoph Hellwig
  0 siblings, 0 replies; 41+ messages in thread
From: Christoph Hellwig @ 2012-09-28 12:27 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

Looks good,

Reviewed-by: Christoph Hellwig <hch@lst.de>

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 03/13] xfs: don't run the sync work if the filesystem is read-only
  2012-09-28  4:44 ` [PATCH 03/13] xfs: don't run the sync work if the filesystem is read-only Dave Chinner
@ 2012-09-28 12:27   ` Christoph Hellwig
  0 siblings, 0 replies; 41+ messages in thread
From: Christoph Hellwig @ 2012-09-28 12:27 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

Looks good,

Reviewed-by: Christoph Hellwig <hch@lst.de>

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 04/13] xfs: sync work is now only periodic log work
  2012-09-28  4:44 ` [PATCH 04/13] xfs: sync work is now only periodic log work Dave Chinner
@ 2012-09-28 12:31   ` Christoph Hellwig
  0 siblings, 0 replies; 41+ messages in thread
From: Christoph Hellwig @ 2012-09-28 12:31 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

On Fri, Sep 28, 2012 at 02:44:36PM +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> The only thing the periodic sync work does now is flush the AIL and
> idle the log. These are really functions of the log code, so move
> the work to xfs_log.c and rename it appropriately.
> 
> The only wart that this leaves behind is the xfssyncd_centisecs
> sysctl, otherwise the xfssyncd is dead. Clean up any comments that
> related to xfssyncd to reflect it's passing.

You'll get an OK for me as I'd like this series in 3.7, but I'd really
prefer if we could stop waking up xfsaild from here ASAP, and instead
just do a schedule_timeout in xfsaild where we currently sleep forever.

Any chance to get a follow on patch for that?


> +#include "xfs_fsops.h"
>  
>  kmem_zone_t	*xfs_log_ticket_zone;
>  
> @@ -698,6 +699,8 @@ xfs_log_mount_finish(xfs_mount_t *mp)
>  		ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
>  	}
>  
> +	xfs_log_work_queue(mp);
> +
>  	return error;

I think this should only be called when xlog_recover_finish didn't
return an error.

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 06/13] xfs: xfs_sync_data is redundant.
  2012-09-28  4:44 ` [PATCH 06/13] xfs: xfs_sync_data is redundant Dave Chinner
@ 2012-09-28 12:34   ` Christoph Hellwig
  2012-10-01 17:44     ` Ben Myers
  2012-09-28 21:44   ` Mark Tinguely
  2012-10-01 20:14   ` Brian Foster
  2 siblings, 1 reply; 41+ messages in thread
From: Christoph Hellwig @ 2012-09-28 12:34 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

>  /*
> + * Flush all dirty data to disk. Must not be called while holding an XFS_ILOCK

slightly too long line.

> + * or a page lock.
> + */
> +static inline void
> +xfs_flush_inodes(struct xfs_inode *ip)
> +{
> +	writeback_inodes_sb_if_idle(VFS_I(ip)->i_sb, WB_REASON_FS_FREE_SPACE);
> +}

Passing an inode here seems very odd, an xfs_mount would be the proper
interface. 

Looks fine modulo these cosmetic changes, so:

Reviewed-by: Christoph Hellwig <hch@lst.de>

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 07/13] xfs: syncd workqueue is no more
  2012-09-28  4:44 ` [PATCH 07/13] xfs: syncd workqueue is no more Dave Chinner
@ 2012-09-28 12:35   ` Christoph Hellwig
  2012-09-28 18:17   ` Mark Tinguely
  1 sibling, 0 replies; 41+ messages in thread
From: Christoph Hellwig @ 2012-09-28 12:35 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

On Fri, Sep 28, 2012 at 02:44:39PM +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> With the syncd functions moved to the log and/or removed, the syncd
> workqueue is the only remaining bit left. It is used by the log
> covering/ail pushing work, as well as by the inode reclaim work.
> 
> Given how cheap workqueues are these days, give the log and inode
> reclaim work their own work queues and kill the syncd work queue.
> 
> Signed-off-by: Dave Chinner <dchinner@redhat.com>

Looks good,

Reviewed-by: Christoph Hellwig <hch@lst.de>

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 07/13] xfs: syncd workqueue is no more
  2012-09-28  4:44 ` [PATCH 07/13] xfs: syncd workqueue is no more Dave Chinner
  2012-09-28 12:35   ` Christoph Hellwig
@ 2012-09-28 18:17   ` Mark Tinguely
  2012-10-01 17:54     ` Ben Myers
  1 sibling, 1 reply; 41+ messages in thread
From: Mark Tinguely @ 2012-09-28 18:17 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

On 09/27/12 23:44, Dave Chinner wrote:
> From: Dave Chinner<dchinner@redhat.com>
>
> With the syncd functions moved to the log and/or removed, the syncd
> workqueue is the only remaining bit left. It is used by the log
> covering/ail pushing work, as well as by the inode reclaim work.
>
> Given how cheap workqueues are these days, give the log and inode
> reclaim work their own work queues and kill the syncd work queue.
>
> Signed-off-by: Dave Chinner<dchinner@redhat.com>
> ---

<deleted info>

> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index fed1eb2..65cf42c 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -776,8 +776,23 @@ xfs_init_mount_workqueues(
>   			WQ_MEM_RECLAIM, 0, mp->m_fsname);
>   	if (!mp->m_cil_workqueue)
>   		goto out_destroy_unwritten;
> +
> +	mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s",
> +			WQ_NON_REENTRANT, 0, mp->m_fsname);
> +	if (!mp->m_reclaim_workqueue)
> +		goto out_destroy_cil;
> +
> +	mp->m_log_workqueue = alloc_workqueue("xfs-reclaim/%s",
> +			WQ_NON_REENTRANT, 0, mp->m_fsname);


Cut/paste error for the worker name?

Looks good.

Reviewed-by: Mark Tinguely <tinguely@sgi.com>

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 06/13] xfs: xfs_sync_data is redundant.
  2012-09-28  4:44 ` [PATCH 06/13] xfs: xfs_sync_data is redundant Dave Chinner
  2012-09-28 12:34   ` Christoph Hellwig
@ 2012-09-28 21:44   ` Mark Tinguely
  2012-10-01 20:14   ` Brian Foster
  2 siblings, 0 replies; 41+ messages in thread
From: Mark Tinguely @ 2012-09-28 21:44 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

On 09/27/12 23:44, Dave Chinner wrote:
> From: Dave Chinner<dchinner@redhat.com>
>
> We don't do any data writeback from XFS any more - the VFS is
> completely responsible for that, including for freeze. We can
> replace the remaining caller with the VFS level function that
> achieves the same thing, but without conflicting with current
> writeback work - writeback_inodes_sb_if_idle().
>
> This means we can remove the flush_work and xfs_flush_inodes() - the
> VFS functionality completely replaces the internal flush queue for
> doing this writeback work in a separate context to avoid stack
> overruns.
>
> This does have one complication - it cannot be called with page
> locks held.  Hence move the flushing of delalloc space when ENOSPC
> occurs back up into xfs_file_aio_buffered_write when we don't hold
> any locks that will stall writeback.
>
> Note that we always need to pass a count of zero to
> generic_file_buffered_write() as the previously written byte count.
> We only do this by accident before this patch by the virtue of ret
> always being zero when there are no errors. Make this explicit
> rather than needing to specifically zero ret in the ENOSPC retry
> case.
>
> Signed-off-by: Dave Chinner<dchinner@redhat.com>
> ---


Looks good.

Reviewed-by: Mark Tinguely <tinguely@sgi.com>

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 06/13] xfs: xfs_sync_data is redundant.
  2012-09-28 12:34   ` Christoph Hellwig
@ 2012-10-01 17:44     ` Ben Myers
  2012-10-02  0:14       ` Dave Chinner
  0 siblings, 1 reply; 41+ messages in thread
From: Ben Myers @ 2012-10-01 17:44 UTC (permalink / raw)
  To: Christoph Hellwig, Dave Chinner; +Cc: xfs

Hi Dave,

On Fri, Sep 28, 2012 at 08:34:43AM -0400, Christoph Hellwig wrote:
> >  /*
> > + * Flush all dirty data to disk. Must not be called while holding an XFS_ILOCK
> 
> slightly too long line.

It comes out at 78 columns when the patch is applied.  Not a problem.

> > + * or a page lock.
> > + */
> > +static inline void
> > +xfs_flush_inodes(struct xfs_inode *ip)
> > +{
> > +	writeback_inodes_sb_if_idle(VFS_I(ip)->i_sb, WB_REASON_FS_FREE_SPACE);
> > +}
> 
> Passing an inode here seems very odd, an xfs_mount would be the proper
> interface. 
> 
> Looks fine modulo these cosmetic changes, so:
>
> Reviewed-by: Christoph Hellwig <hch@lst.de>

I cleaned up these cosmetic changes on your behalf.  Here is the result:


>From xfs-bounces@oss.sgi.com  Fri Sep 28 16:27:23 2012
From: Dave Chinner <david@fromorbit.com>
To: xfs@oss.sgi.com
Subject: [PATCH 06/13] xfs: xfs_sync_data is redundant.
Date: Fri, 28 Sep 2012 14:44:38 +1000
Message-Id: <1348807485-20165-7-git-send-email-david@fromorbit.com>
X-Patchwork-Id: 4147

From: Dave Chinner <dchinner@redhat.com>

We don't do any data writeback from XFS any more - the VFS is
completely responsible for that, including for freeze. We can
replace the remaining caller with the VFS level function that
achieves the same thing, but without conflicting with current
writeback work - writeback_inodes_sb_if_idle().

This means we can remove the flush_work and xfs_flush_inodes() - the
VFS functionality completely replaces the internal flush queue for
doing this writeback work in a separate context to avoid stack
overruns.

This does have one complication - it cannot be called with page
locks held.  Hence move the flushing of delalloc space when ENOSPC
occurs back up into xfs_file_aio_buffered_write when we don't hold
any locks that will stall writeback.

Note that we always need to pass a count of zero to
generic_file_buffered_write() as the previously written byte count.
We only do this by accident before this patch by the virtue of ret
always being zero when there are no errors. Make this explicit
rather than needing to specifically zero ret in the ENOSPC retry
case.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>

---
 fs/xfs/xfs_file.c  |   13 +++++----
 fs/xfs/xfs_inode.h |   10 +++++++
 fs/xfs/xfs_iomap.c |   23 +++++-----------
 fs/xfs/xfs_mount.h |    1 -
 fs/xfs/xfs_super.c |    3 --
 fs/xfs/xfs_sync.c  |   78 ----------------------------------------------------
 fs/xfs/xfs_sync.h  |    3 --
 7 files changed, 24 insertions(+), 107 deletions(-)

Index: xfs/fs/xfs/xfs_file.c
===================================================================
--- xfs.orig/fs/xfs/xfs_file.c
+++ xfs/fs/xfs/xfs_file.c
@@ -728,16 +728,17 @@ xfs_file_buffered_aio_write(
 write_retry:
 	trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
 	ret = generic_file_buffered_write(iocb, iovp, nr_segs,
-			pos, &iocb->ki_pos, count, ret);
+			pos, &iocb->ki_pos, count, 0);
+
 	/*
-	 * if we just got an ENOSPC, flush the inode now we aren't holding any
-	 * page locks and retry *once*
+	 * If we just got an ENOSPC, try to write back all dirty inodes to
+	 * convert delalloc space to free up some of the excess reserved
+	 * metadata space.
 	 */
 	if (ret == -ENOSPC && !enospc) {
 		enospc = 1;
-		ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
-		if (!ret)
-			goto write_retry;
+		xfs_flush_inodes(ip->i_mount);
+		goto write_retry;
 	}
 
 	current->backing_dev_info = NULL;
Index: xfs/fs/xfs/xfs_iomap.c
===================================================================
--- xfs.orig/fs/xfs/xfs_iomap.c
+++ xfs/fs/xfs/xfs_iomap.c
@@ -373,7 +373,7 @@ xfs_iomap_write_delay(
 	xfs_extlen_t	extsz;
 	int		nimaps;
 	xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS];
-	int		prealloc, flushed = 0;
+	int		prealloc;
 	int		error;
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
@@ -434,26 +434,17 @@ retry:
 	}
 
 	/*
-	 * If bmapi returned us nothing, we got either ENOSPC or EDQUOT.  For
-	 * ENOSPC, * flush all other inodes with delalloc blocks to free up
-	 * some of the excess reserved metadata space. For both cases, retry
+	 * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. Retry
 	 * without EOF preallocation.
 	 */
 	if (nimaps == 0) {
 		trace_xfs_delalloc_enospc(ip, offset, count);
-		if (flushed)
-			return XFS_ERROR(error ? error : ENOSPC);
-
-		if (error == ENOSPC) {
-			xfs_iunlock(ip, XFS_ILOCK_EXCL);
-			xfs_flush_inodes(ip);
-			xfs_ilock(ip, XFS_ILOCK_EXCL);
+		if (prealloc) {
+			prealloc = 0;
+			error = 0;
+			goto retry;
 		}
-
-		flushed = 1;
-		error = 0;
-		prealloc = 0;
-		goto retry;
+		return XFS_ERROR(error ? error : ENOSPC);
 	}
 
 	if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip)))
Index: xfs/fs/xfs/xfs_mount.h
===================================================================
--- xfs.orig/fs/xfs/xfs_mount.h
+++ xfs/fs/xfs/xfs_mount.h
@@ -198,7 +198,6 @@ typedef struct xfs_mount {
 #endif
 	struct xfs_mru_cache	*m_filestream;  /* per-mount filestream data */
 	struct delayed_work	m_reclaim_work;	/* background inode reclaim */
-	struct work_struct	m_flush_work;	/* background inode flush */
 	__int64_t		m_update_flags;	/* sb flags we need to update
 						   on the next remount,rw */
 	struct shrinker		m_inode_shrink;	/* inode reclaim shrinker */
@@ -381,6 +380,16 @@ extern int	xfs_dev_is_read_only(struct x
 
 extern void	xfs_set_low_space_thresholds(struct xfs_mount *);
 
+/*
+ * Flush all dirty data to disk. Must not be called while holding an XFS_ILOCK
+ * or a page lock.
+ */
+static inline void
+xfs_flush_inodes(struct xfs_mount *mp)
+{
+	writeback_inodes_sb_if_idle(mp->m_super, WB_REASON_FS_FREE_SPACE);
+}
+
 #endif	/* __KERNEL__ */
 
 extern void	xfs_mod_sb(struct xfs_trans *, __int64_t);
Index: xfs/fs/xfs/xfs_super.c
===================================================================
--- xfs.orig/fs/xfs/xfs_super.c
+++ xfs/fs/xfs/xfs_super.c
@@ -918,8 +918,6 @@ xfs_fs_put_super(
 {
 	struct xfs_mount	*mp = XFS_M(sb);
 
-	cancel_work_sync(&mp->m_flush_work);
-
 	xfs_filestream_unmount(mp);
 	xfs_unmountfs(mp);
 
@@ -1235,7 +1233,6 @@ xfs_fs_fill_super(
 	spin_lock_init(&mp->m_sb_lock);
 	mutex_init(&mp->m_growlock);
 	atomic_set(&mp->m_active_trans, 0);
-	INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
 	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
 
 	mp->m_super = sb;
Index: xfs/fs/xfs/xfs_sync.c
===================================================================
--- xfs.orig/fs/xfs/xfs_sync.c
+++ xfs/fs/xfs/xfs_sync.c
@@ -217,51 +217,6 @@ xfs_inode_ag_iterator(
 }
 
 STATIC int
-xfs_sync_inode_data(
-	struct xfs_inode	*ip,
-	struct xfs_perag	*pag,
-	int			flags)
-{
-	struct inode		*inode = VFS_I(ip);
-	struct address_space *mapping = inode->i_mapping;
-	int			error = 0;
-
-	if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
-		return 0;
-
-	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) {
-		if (flags & SYNC_TRYLOCK)
-			return 0;
-		xfs_ilock(ip, XFS_IOLOCK_SHARED);
-	}
-
-	error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ?
-				0 : XBF_ASYNC, FI_NONE);
-	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-	return error;
-}
-
-/*
- * Write out pagecache data for the whole filesystem.
- */
-STATIC int
-xfs_sync_data(
-	struct xfs_mount	*mp,
-	int			flags)
-{
-	int			error;
-
-	ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
-
-	error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags);
-	if (error)
-		return XFS_ERROR(error);
-
-	xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
-	return 0;
-}
-
-STATIC int
 xfs_sync_fsdata(
 	struct xfs_mount	*mp)
 {
@@ -415,39 +370,6 @@ xfs_reclaim_worker(
 	xfs_syncd_queue_reclaim(mp);
 }
 
-/*
- * Flush delayed allocate data, attempting to free up reserved space
- * from existing allocations.  At this point a new allocation attempt
- * has failed with ENOSPC and we are in the process of scratching our
- * heads, looking about for more room.
- *
- * Queue a new data flush if there isn't one already in progress and
- * wait for completion of the flush. This means that we only ever have one
- * inode flush in progress no matter how many ENOSPC events are occurring and
- * so will prevent the system from bogging down due to every concurrent
- * ENOSPC event scanning all the active inodes in the system for writeback.
- */
-void
-xfs_flush_inodes(
-	struct xfs_inode	*ip)
-{
-	struct xfs_mount	*mp = ip->i_mount;
-
-	queue_work(xfs_syncd_wq, &mp->m_flush_work);
-	flush_work_sync(&mp->m_flush_work);
-}
-
-void
-xfs_flush_worker(
-	struct work_struct *work)
-{
-	struct xfs_mount *mp = container_of(work,
-					struct xfs_mount, m_flush_work);
-
-	xfs_sync_data(mp, SYNC_TRYLOCK);
-	xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
-}
-
 void
 __xfs_inode_set_reclaim_tag(
 	struct xfs_perag	*pag,
Index: xfs/fs/xfs/xfs_sync.h
===================================================================
--- xfs.orig/fs/xfs/xfs_sync.h
+++ xfs/fs/xfs/xfs_sync.h
@@ -26,14 +26,11 @@ struct xfs_perag;
 
 extern struct workqueue_struct	*xfs_syncd_wq;	/* sync workqueue */
 
-void xfs_flush_worker(struct work_struct *work);
 void xfs_reclaim_worker(struct work_struct *work);
 
 int xfs_quiesce_data(struct xfs_mount *mp);
 void xfs_quiesce_attr(struct xfs_mount *mp);
 
-void xfs_flush_inodes(struct xfs_inode *ip);
-
 int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
 int xfs_reclaim_inodes_count(struct xfs_mount *mp);
 void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
Index: xfs/fs/xfs/xfs_vnodeops.c
===================================================================
--- xfs.orig/fs/xfs/xfs_vnodeops.c
+++ xfs/fs/xfs/xfs_vnodeops.c
@@ -777,7 +777,7 @@ xfs_create(
 			XFS_TRANS_PERM_LOG_RES, log_count);
 	if (error == ENOSPC) {
 		/* flush outstanding delalloc blocks and retry */
-		xfs_flush_inodes(dp);
+		xfs_flush_inodes(dp->i_mount);
 		error = xfs_trans_reserve(tp, resblks, log_res, 0,
 				XFS_TRANS_PERM_LOG_RES, log_count);
 	}

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 07/13] xfs: syncd workqueue is no more
  2012-09-28 18:17   ` Mark Tinguely
@ 2012-10-01 17:54     ` Ben Myers
  0 siblings, 0 replies; 41+ messages in thread
From: Ben Myers @ 2012-10-01 17:54 UTC (permalink / raw)
  To: Mark Tinguely; +Cc: xfs

On Fri, Sep 28, 2012 at 01:17:07PM -0500, Mark Tinguely wrote:
> On 09/27/12 23:44, Dave Chinner wrote:
> >From: Dave Chinner<dchinner@redhat.com>
> >
> >With the syncd functions moved to the log and/or removed, the syncd
> >workqueue is the only remaining bit left. It is used by the log
> >covering/ail pushing work, as well as by the inode reclaim work.
> >
> >Given how cheap workqueues are these days, give the log and inode
> >reclaim work their own work queues and kill the syncd work queue.
> >
> >Signed-off-by: Dave Chinner<dchinner@redhat.com>
> >---
> 
> <deleted info>
> 
> >diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> >index fed1eb2..65cf42c 100644
> >--- a/fs/xfs/xfs_super.c
> >+++ b/fs/xfs/xfs_super.c
> >@@ -776,8 +776,23 @@ xfs_init_mount_workqueues(
> >  			WQ_MEM_RECLAIM, 0, mp->m_fsname);
> >  	if (!mp->m_cil_workqueue)
> >  		goto out_destroy_unwritten;
> >+
> >+	mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s",
> >+			WQ_NON_REENTRANT, 0, mp->m_fsname);
> >+	if (!mp->m_reclaim_workqueue)
> >+		goto out_destroy_cil;
> >+
> >+	mp->m_log_workqueue = alloc_workqueue("xfs-reclaim/%s",
> >+			WQ_NON_REENTRANT, 0, mp->m_fsname);

Changed this to "xfs-log/%s" on Dave's behalf.

-Ben

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 06/13] xfs: xfs_sync_data is redundant.
  2012-09-28  4:44 ` [PATCH 06/13] xfs: xfs_sync_data is redundant Dave Chinner
  2012-09-28 12:34   ` Christoph Hellwig
  2012-09-28 21:44   ` Mark Tinguely
@ 2012-10-01 20:14   ` Brian Foster
  2012-10-01 21:31     ` Mark Tinguely
                       ` (2 more replies)
  2 siblings, 3 replies; 41+ messages in thread
From: Brian Foster @ 2012-10-01 20:14 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

[-- Attachment #1: Type: text/plain, Size: 9910 bytes --]

On 09/28/2012 12:44 AM, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> We don't do any data writeback from XFS any more - the VFS is
> completely responsible for that, including for freeze. We can
> replace the remaining caller with the VFS level function that
> achieves the same thing, but without conflicting with current
> writeback work - writeback_inodes_sb_if_idle().
> 
> This means we can remove the flush_work and xfs_flush_inodes() - the
> VFS functionality completely replaces the internal flush queue for
> doing this writeback work in a separate context to avoid stack
> overruns.
> 
> This does have one complication - it cannot be called with page
> locks held.  Hence move the flushing of delalloc space when ENOSPC
> occurs back up into xfs_file_aio_buffered_write when we don't hold
> any locks that will stall writeback.
> 
> Note that we always need to pass a count of zero to
> generic_file_buffered_write() as the previously written byte count.
> We only do this by accident before this patch by the virtue of ret
> always being zero when there are no errors. Make this explicit
> rather than needing to specifically zero ret in the ENOSPC retry
> case.
> 
> Signed-off-by: Dave Chinner <dchinner@redhat.com>

Heads up... I was doing some testing against my eofblocks set rebased
against this patchset and I'm reproducing a new 273 failure. The failure
bisects down to this patch.

With the bisection, I'm running xfs top of tree plus the following patch:

xfs: only update the last_sync_lsn when a transaction completes

... and patches 1-6 of this set on top of that. i.e.:

xfs: xfs_sync_data is redundant.
xfs: Bring some sanity to log unmounting
xfs: sync work is now only periodic log work
xfs: don't run the sync work if the filesystem is read-only
xfs: rationalise xfs_mount_wq users
xfs: xfs_syncd_stop must die
xfs: only update the last_sync_lsn when a transaction completes
xfs: Make inode32 a remountable option

This is on a 16p (according to /proc/cpuinfo) x86-64 system with 32GB
RAM. The test and scratch volumes are both 500GB lvm volumes on top of a
hardware raid. I haven't looked into this at all yet but I wanted to
drop it on the list for now. The 273 output is attached.

Brian

> ---
>  fs/xfs/xfs_file.c  |   13 +++++----
>  fs/xfs/xfs_inode.h |   10 +++++++
>  fs/xfs/xfs_iomap.c |   23 +++++-----------
>  fs/xfs/xfs_mount.h |    1 -
>  fs/xfs/xfs_super.c |    3 --
>  fs/xfs/xfs_sync.c  |   78 ----------------------------------------------------
>  fs/xfs/xfs_sync.h  |    3 --
>  7 files changed, 24 insertions(+), 107 deletions(-)
> 
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 1eaeb8b..2cadcf7 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -728,16 +728,17 @@ xfs_file_buffered_aio_write(
>  write_retry:
>  	trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
>  	ret = generic_file_buffered_write(iocb, iovp, nr_segs,
> -			pos, &iocb->ki_pos, count, ret);
> +			pos, &iocb->ki_pos, count, 0);
> +
>  	/*
> -	 * if we just got an ENOSPC, flush the inode now we aren't holding any
> -	 * page locks and retry *once*
> +	 * If we just got an ENOSPC, try to write back all dirty inodes to
> +	 * convert delalloc space to free up some of the excess reserved
> +	 * metadata space.
>  	 */
>  	if (ret == -ENOSPC && !enospc) {
>  		enospc = 1;
> -		ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
> -		if (!ret)
> -			goto write_retry;
> +		xfs_flush_inodes(ip);
> +		goto write_retry;
>  	}
>  
>  	current->backing_dev_info = NULL;
> diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> index 94b32f9..a12fe18 100644
> --- a/fs/xfs/xfs_inode.h
> +++ b/fs/xfs/xfs_inode.h
> @@ -288,6 +288,16 @@ xfs_new_eof(struct xfs_inode *ip, xfs_fsize_t new_size)
>  }
>  
>  /*
> + * Flush all dirty data to disk. Must not be called while holding an XFS_ILOCK
> + * or a page lock.
> + */
> +static inline void
> +xfs_flush_inodes(struct xfs_inode *ip)
> +{
> +	writeback_inodes_sb_if_idle(VFS_I(ip)->i_sb, WB_REASON_FS_FREE_SPACE);
> +}
> +
> +/*
>   * i_flags helper functions
>   */
>  static inline void
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index 973dff6..f858b90 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -373,7 +373,7 @@ xfs_iomap_write_delay(
>  	xfs_extlen_t	extsz;
>  	int		nimaps;
>  	xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS];
> -	int		prealloc, flushed = 0;
> +	int		prealloc;
>  	int		error;
>  
>  	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
> @@ -434,26 +434,17 @@ retry:
>  	}
>  
>  	/*
> -	 * If bmapi returned us nothing, we got either ENOSPC or EDQUOT.  For
> -	 * ENOSPC, * flush all other inodes with delalloc blocks to free up
> -	 * some of the excess reserved metadata space. For both cases, retry
> +	 * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. Retry
>  	 * without EOF preallocation.
>  	 */
>  	if (nimaps == 0) {
>  		trace_xfs_delalloc_enospc(ip, offset, count);
> -		if (flushed)
> -			return XFS_ERROR(error ? error : ENOSPC);
> -
> -		if (error == ENOSPC) {
> -			xfs_iunlock(ip, XFS_ILOCK_EXCL);
> -			xfs_flush_inodes(ip);
> -			xfs_ilock(ip, XFS_ILOCK_EXCL);
> +		if (prealloc) {
> +			prealloc = 0;
> +			error = 0;
> +			goto retry;
>  		}
> -
> -		flushed = 1;
> -		error = 0;
> -		prealloc = 0;
> -		goto retry;
> +		return XFS_ERROR(error ? error : ENOSPC);
>  	}
>  
>  	if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip)))
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index 26e46ae..a54b5aa 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -198,7 +198,6 @@ typedef struct xfs_mount {
>  #endif
>  	struct xfs_mru_cache	*m_filestream;  /* per-mount filestream data */
>  	struct delayed_work	m_reclaim_work;	/* background inode reclaim */
> -	struct work_struct	m_flush_work;	/* background inode flush */
>  	__int64_t		m_update_flags;	/* sb flags we need to update
>  						   on the next remount,rw */
>  	struct shrinker		m_inode_shrink;	/* inode reclaim shrinker */
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index b85ca2d..fed1eb2 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -918,8 +918,6 @@ xfs_fs_put_super(
>  {
>  	struct xfs_mount	*mp = XFS_M(sb);
>  
> -	cancel_work_sync(&mp->m_flush_work);
> -
>  	xfs_filestream_unmount(mp);
>  	xfs_unmountfs(mp);
>  
> @@ -1231,7 +1229,6 @@ xfs_fs_fill_super(
>  	spin_lock_init(&mp->m_sb_lock);
>  	mutex_init(&mp->m_growlock);
>  	atomic_set(&mp->m_active_trans, 0);
> -	INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
>  	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
>  
>  	mp->m_super = sb;
> diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
> index 7527610..6a2ada3 100644
> --- a/fs/xfs/xfs_sync.c
> +++ b/fs/xfs/xfs_sync.c
> @@ -217,51 +217,6 @@ xfs_inode_ag_iterator(
>  }
>  
>  STATIC int
> -xfs_sync_inode_data(
> -	struct xfs_inode	*ip,
> -	struct xfs_perag	*pag,
> -	int			flags)
> -{
> -	struct inode		*inode = VFS_I(ip);
> -	struct address_space *mapping = inode->i_mapping;
> -	int			error = 0;
> -
> -	if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
> -		return 0;
> -
> -	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) {
> -		if (flags & SYNC_TRYLOCK)
> -			return 0;
> -		xfs_ilock(ip, XFS_IOLOCK_SHARED);
> -	}
> -
> -	error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ?
> -				0 : XBF_ASYNC, FI_NONE);
> -	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
> -	return error;
> -}
> -
> -/*
> - * Write out pagecache data for the whole filesystem.
> - */
> -STATIC int
> -xfs_sync_data(
> -	struct xfs_mount	*mp,
> -	int			flags)
> -{
> -	int			error;
> -
> -	ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
> -
> -	error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags);
> -	if (error)
> -		return XFS_ERROR(error);
> -
> -	xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
> -	return 0;
> -}
> -
> -STATIC int
>  xfs_sync_fsdata(
>  	struct xfs_mount	*mp)
>  {
> @@ -415,39 +370,6 @@ xfs_reclaim_worker(
>  	xfs_syncd_queue_reclaim(mp);
>  }
>  
> -/*
> - * Flush delayed allocate data, attempting to free up reserved space
> - * from existing allocations.  At this point a new allocation attempt
> - * has failed with ENOSPC and we are in the process of scratching our
> - * heads, looking about for more room.
> - *
> - * Queue a new data flush if there isn't one already in progress and
> - * wait for completion of the flush. This means that we only ever have one
> - * inode flush in progress no matter how many ENOSPC events are occurring and
> - * so will prevent the system from bogging down due to every concurrent
> - * ENOSPC event scanning all the active inodes in the system for writeback.
> - */
> -void
> -xfs_flush_inodes(
> -	struct xfs_inode	*ip)
> -{
> -	struct xfs_mount	*mp = ip->i_mount;
> -
> -	queue_work(xfs_syncd_wq, &mp->m_flush_work);
> -	flush_work_sync(&mp->m_flush_work);
> -}
> -
> -void
> -xfs_flush_worker(
> -	struct work_struct *work)
> -{
> -	struct xfs_mount *mp = container_of(work,
> -					struct xfs_mount, m_flush_work);
> -
> -	xfs_sync_data(mp, SYNC_TRYLOCK);
> -	xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
> -}
> -
>  void
>  __xfs_inode_set_reclaim_tag(
>  	struct xfs_perag	*pag,
> diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h
> index 8d58fab..0018e84 100644
> --- a/fs/xfs/xfs_sync.h
> +++ b/fs/xfs/xfs_sync.h
> @@ -26,14 +26,11 @@ struct xfs_perag;
>  
>  extern struct workqueue_struct	*xfs_syncd_wq;	/* sync workqueue */
>  
> -void xfs_flush_worker(struct work_struct *work);
>  void xfs_reclaim_worker(struct work_struct *work);
>  
>  int xfs_quiesce_data(struct xfs_mount *mp);
>  void xfs_quiesce_attr(struct xfs_mount *mp);
>  
> -void xfs_flush_inodes(struct xfs_inode *ip);
> -
>  int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
>  int xfs_reclaim_inodes_count(struct xfs_mount *mp);
>  void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
> 


[-- Attachment #2: 273.out.bad --]
[-- Type: text/plain, Size: 1865 bytes --]

QA output created by 273
------------------------------
start the workload
------------------------------
_porter 31 not complete
_porter 79 not complete
_porter 149 not complete_porter 74 not complete
_porter 161 not complete
_porter 54 not complete
_porter 98 not complete
_porter 99 not complete
_porter 167 not complete
_porter 76 not complete
_porter 45 not complete
_porter 152 not complete
_porter 173 not complete_porter 24 not complete

_porter 6 not complete
_porter 104 not complete
_porter 117 not complete
_porter 181 not complete
_porter 30 not complete
_porter 148 not complete
_porter 147 not complete
_porter 77 not complete
_porter 111 not complete
_porter 56 not complete

_porter 49 not complete
_porter 165 not complete
_porter 97 not complete
_porter 158 not complete
_porter 157 not complete
_porter 179 not complete
_porter 191 not complete
_porter 57 not complete
_porter 123 not complete
_porter 160 not complete
_porter 100 not complete
_porter 9 not complete
_porter 95 not complete
_porter 10 not complete
_porter 53 not complete
_porter 73 not complete
_porter 27 not complete
_porter 4 not complete
_porter 5 not complete
_porter 39 not complete
_porter 43 not complete
_porter 13 not complete
_porter 48 not complete
_porter 35 not complete
_porter 70 not complete
_porter 29 not complete
_porter 7 not complete
_porter 71 not complete
_porter 93 not complete
_porter 78 not complete

_porter 135 not complete
_porter 174 not complete
_porter 80 not complete
_porter 102 not complete
_porter 183 not complete
_porter 185 not complete_porter 168 not complete
_porter 178 not complete
_porter 129 not complete
_porter 193 not complete
_porter 192 not complete
_porter 199 not complete
_porter 120 not complete
_porter 125 not complete
_porter 126 not complete
_porter 145 not complete
_porter 124 not complete
_porter 172 not complete

[-- Attachment #3: Type: text/plain, Size: 121 bytes --]

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 06/13] xfs: xfs_sync_data is redundant.
  2012-10-01 20:14   ` Brian Foster
@ 2012-10-01 21:31     ` Mark Tinguely
  2012-10-02  0:10     ` Dave Chinner
  2012-10-02 20:25     ` Ben Myers
  2 siblings, 0 replies; 41+ messages in thread
From: Mark Tinguely @ 2012-10-01 21:31 UTC (permalink / raw)
  To: Brian Foster; +Cc: xfs

On 10/01/12 15:14, Brian Foster wrote:
<deletes by mt>
> Heads up... I was doing some testing against my eofblocks set rebased
> against this patchset and I'm reproducing a new 273 failure. The failure
> bisects down to this patch.
>
> With the bisection, I'm running xfs top of tree plus the following patch:
>
> xfs: only update the last_sync_lsn when a transaction completes
>
> ... and patches 1-6 of this set on top of that. i.e.:
>
> xfs: xfs_sync_data is redundant.
> xfs: Bring some sanity to log unmounting
> xfs: sync work is now only periodic log work
> xfs: don't run the sync work if the filesystem is read-only
> xfs: rationalise xfs_mount_wq users
> xfs: xfs_syncd_stop must die
> xfs: only update the last_sync_lsn when a transaction completes
> xfs: Make inode32 a remountable option
>
> This is on a 16p (according to /proc/cpuinfo) x86-64 system with 32GB
> RAM. The test and scratch volumes are both 500GB lvm volumes on top of a
> hardware raid. I haven't looked into this at all yet but I wanted to
> drop it on the list for now. The 273 output is attached.
>
> Brian

  <deletes by mt>
>
>
> 273.out.bad
>
>
> QA output created by 273
> ------------------------------
> start the workload
> ------------------------------
> _porter 31 not complete
> _porter 79 not complete
> _porter 149 not complete_porter 74 not complete
> _porter 161 not complete
> _porter 54 not complete
> _porter 98 not complete
> _porter 99 not complete
> _porter 167 not complete
> _porter 76 not complete
> _porter 45 not complete
> _porter 152 not complete
> _porter 173 not complete_porter 24 not complete

  <deletes by mt>

I see it too on a single machine. It looks like an interaction between 
patch 06 and the "...update the last_sync_lsn...".

I like the "...update the last_sync_lsn ..." patch because it fixes the 
"xlog_verify_tail_lsn: tail wrapped" and "xlog_verify_tail_lsn: ran out 
of log space" messages that I am getting on that machine.

--Mark.

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 06/13] xfs: xfs_sync_data is redundant.
  2012-10-01 20:14   ` Brian Foster
  2012-10-01 21:31     ` Mark Tinguely
@ 2012-10-02  0:10     ` Dave Chinner
  2012-10-02  0:44       ` Brian Foster
  2012-10-02 13:22       ` Christoph Hellwig
  2012-10-02 20:25     ` Ben Myers
  2 siblings, 2 replies; 41+ messages in thread
From: Dave Chinner @ 2012-10-02  0:10 UTC (permalink / raw)
  To: Brian Foster; +Cc: xfs

Hi, Brian.

On Mon, Oct 01, 2012 at 04:14:40PM -0400, Brian Foster wrote:
> Warning: This message has had one or more attachments removed
> Warning: (273.out.bad).
> Warning: Please read the "boprocket-Attachment-Warning.txt" attachment(s) for more information.

Which says:

> At Mon Oct  1 20:14:58 2012 the virus scanner said:                
>    MailScanner: Attempt to hide real filename extension (273.out.bad)

Looks like your mailer did something wrong with the attachment....


> Heads up... I was doing some testing against my eofblocks set rebased
> against this patchset and I'm reproducing a new 273 failure. The failure
> bisects down to this patch.
> 
> With the bisection, I'm running xfs top of tree plus the following patch:
> 
> xfs: only update the last_sync_lsn when a transaction completes
> 
> ... and patches 1-6 of this set on top of that. i.e.:
> 
> xfs: xfs_sync_data is redundant.
> xfs: Bring some sanity to log unmounting
> xfs: sync work is now only periodic log work
> xfs: don't run the sync work if the filesystem is read-only
> xfs: rationalise xfs_mount_wq users
> xfs: xfs_syncd_stop must die
> xfs: only update the last_sync_lsn when a transaction completes
> xfs: Make inode32 a remountable option
> 
> This is on a 16p (according to /proc/cpuinfo) x86-64 system with 32GB
> RAM. The test and scratch volumes are both 500GB lvm volumes on top of a
> hardware raid.
> I haven't looked into this at all yet but I wanted to
> drop it on the list for now. The 273 output is attached.

I bet you had writes fail with ENOSPC - 201 * 426
= 85626 files of 8k each, that gives 685MB. When the test is
running, I see upwards of 1.5GB of space consumed, which then slowly
drops again as data files are closed and data is written.

Some of that space is specualtive preallocation (4k per file, I
think), but also a significant amount of it is metadata reservation
for delayed allocation (4 blocks per file, IIRC). If I've only got
2GB RAM on my machine, then writeback starts at 200MB written, and
so well before the fs runs out of space the metadata reservations
are being released.

I just upped the VM to 8GB RAM, and immediately I see the test
starting to fail. And this is in 273.full:

cp: cannot create regular file `/mnt/scratch/sub_198/origin/file_141': No space left on device
cp: cannot create regular file `/mnt/scratch/sub_198/origin/file_142': No space left on device
cp: cannot create regular file `/mnt/scratch/sub_198/origin/file_1cp: cannot create regular filcp: cannot create regular file `/mnt/scratch/sub_198/origin/file_147': No space left on device
cp: cannot create regular file `/mnt/scratch/sub_198/origin/file_1cp: cannot create regular filcp: writing `/mnt/scratch/sub_198/origin/file_149': No space left cp: writing `/mnt/scratch/sub_156/origin/file_275': No space left on device
cp: failed to extencp: writing `/mnt/scratch/sub_198/origin/file_150': No space left cp: writing `/mnt/scratch/sub_156/origin/file_276': No space left on device
cp: failed to extencp: cannot create regular file `/mnt/scratch/sub_124/origin/file_3cp: cannot create regular filcp: writing `/mnt/scratch/sub_124/origin/file_378': No space left cp: writing `/mnt/scratch/sub_173/origin/file_250': No space left on device
cp: failed to extencp: writing `/mnt/scratch/sub_124/origin/file_379': No space left cp: cannot create regular file `/mnt/scratch/sub_173/origin/file_2cp: cannot create regular file `/mnt/scratch/sub_134/origin/file_337': No space left on device
cp: cannot create regular filcp: cannot create regular filcp: writing `/mnt/scratch/sub_159/origin/file_307': No space left on device
cp: failed to extend `/mnt/scratch/sub_159/origin/file_307': No space left on device
cp: writing `/mnt/scratch/sub_159/origin/file_308': No space left on device
cp: failed to extend `/mnt/scratch/sub_159/origin/file_308': No space left on device
cp: cannot create regular file `/mnt/scratch/sub_159/origin/file_309': No space left on device
cp: cannot create regular file `/mnt/scratch/sub_159/origin/file_310': No space left on device
cp: cannot create regular file `/mnt/scratch/sub_159/origin/file_311': No space left on device
cp: cannot create regular file `/mnt/scratch/sub_159/origin/file_312': No space left on device
cp: cannot create regular file `/mnt/scratch/sub_159/origin/file_313': No space left on device
cp: cannot create regular file `/mnt/scratch/sub_159/origin/file_314': No space left on device
.....

So, turning off speculative preallocation via the allocsize mount
option doesn't fix the problem. IOWs, the problem is too much active
metadata reservation.  If we are caching 685MB, that's less than the
writeback thresholds of a large memory machine, so the metadata
reservations won't be trimmed at all until ENOSPC actually occurs
and writeback is then started.

The problem is that writeback_inodes_sb_if_idle() does not block if
there is already writeback in progress, so the callers just keep
hitting ENOSPC rather than being throttled waiting for delalloc
conversion.

The patch below should fix this - it changes xfs_flush_inodes() to
us sync_inodes_sb(), which will issue IO and block waiting for it to
complete, just like xfs_flush_inodes() used to. Indeed, it passes
again on my VM with 8GB RAM....

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

xfs: make inode writeback at ENOSPC blocking.

From: Dave Chinner <dchinner@redhat.com>

writeback_inodes_sb_if_idle() is not sufficient to trigger delalloc
conversion fast enough to prevent spurious ENOSPC whent here are
hundreds of writers, thousands of small files and GBs of free RAM.
Change this to use sync_sb_inodes() to block callers while we wait
for writeback like the previous xfs_flush_inodes implementation did.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_inode.h |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index da69c18..0ec7a46 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -294,7 +294,7 @@ xfs_new_eof(struct xfs_inode *ip, xfs_fsize_t new_size)
 static inline void
 xfs_flush_inodes(struct xfs_inode *ip)
 {
-	writeback_inodes_sb_if_idle(VFS_I(ip)->i_sb, WB_REASON_FS_FREE_SPACE);
+	sync_inodes_sb(VFS_I(ip)->i_sb);
 }
 
 /*

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 41+ messages in thread

* Re: [PATCH 06/13] xfs: xfs_sync_data is redundant.
  2012-10-01 17:44     ` Ben Myers
@ 2012-10-02  0:14       ` Dave Chinner
  2012-10-02 19:39         ` Ben Myers
  0 siblings, 1 reply; 41+ messages in thread
From: Dave Chinner @ 2012-10-02  0:14 UTC (permalink / raw)
  To: Ben Myers; +Cc: Christoph Hellwig, xfs

On Mon, Oct 01, 2012 at 12:44:10PM -0500, Ben Myers wrote:
> Hi Dave,
> 
> On Fri, Sep 28, 2012 at 08:34:43AM -0400, Christoph Hellwig wrote:
> > >  /*
> > > + * Flush all dirty data to disk. Must not be called while holding an XFS_ILOCK
> > 
> > slightly too long line.
> 
> It comes out at 78 columns when the patch is applied.  Not a problem.
> 
> > > + * or a page lock.
> > > + */
> > > +static inline void
> > > +xfs_flush_inodes(struct xfs_inode *ip)
> > > +{
> > > +	writeback_inodes_sb_if_idle(VFS_I(ip)->i_sb, WB_REASON_FS_FREE_SPACE);
> > > +}
> > 
> > Passing an inode here seems very odd, an xfs_mount would be the proper
> > interface. 
> > 
> > Looks fine modulo these cosmetic changes, so:
> >
> > Reviewed-by: Christoph Hellwig <hch@lst.de>
> 
> I cleaned up these cosmetic changes on your behalf.  Here is the result:

Looks good. Thanks for doing this, Ben.

FWIW, The patch I just posted to change
writeback_inodes_sb_if_idle() to sync_inodes_sb() will need
massaging to apply to your tree now. I'll update it if you push your
tree out to oss.sgi.com before taking it...

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 06/13] xfs: xfs_sync_data is redundant.
  2012-10-02  0:10     ` Dave Chinner
@ 2012-10-02  0:44       ` Brian Foster
  2012-10-02 13:01         ` Brian Foster
  2012-10-02 13:22       ` Christoph Hellwig
  1 sibling, 1 reply; 41+ messages in thread
From: Brian Foster @ 2012-10-02  0:44 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

On 10/01/2012 08:10 PM, Dave Chinner wrote:
> Hi, Brian.
> 
> On Mon, Oct 01, 2012 at 04:14:40PM -0400, Brian Foster wrote:
>> Warning: This message has had one or more attachments removed
>> Warning: (273.out.bad).
>> Warning: Please read the "boprocket-Attachment-Warning.txt" attachment(s) for more information.
> 
> Which says:
> 
>> At Mon Oct  1 20:14:58 2012 the virus scanner said:                
>>    MailScanner: Attempt to hide real filename extension (273.out.bad)
> 
> Looks like your mailer did something wrong with the attachment....
> 

Ugh, sorry. The output file was filled with messages like so:

_porter 31 not complete
_porter 79 not complete
_porter 149 not complete
_porter 74 not complete
_porter 161 not complete
_porter 54 not complete
_porter 98 not complete
_porter 99 not complete
...

> 
>> Heads up... I was doing some testing against my eofblocks set rebased
>> against this patchset and I'm reproducing a new 273 failure. The failure
>> bisects down to this patch.
>>
>> With the bisection, I'm running xfs top of tree plus the following patch:
>>
>> xfs: only update the last_sync_lsn when a transaction completes
>>
>> ... and patches 1-6 of this set on top of that. i.e.:
>>
>> xfs: xfs_sync_data is redundant.
>> xfs: Bring some sanity to log unmounting
>> xfs: sync work is now only periodic log work
>> xfs: don't run the sync work if the filesystem is read-only
>> xfs: rationalise xfs_mount_wq users
>> xfs: xfs_syncd_stop must die
>> xfs: only update the last_sync_lsn when a transaction completes
>> xfs: Make inode32 a remountable option
>>
>> This is on a 16p (according to /proc/cpuinfo) x86-64 system with 32GB
>> RAM. The test and scratch volumes are both 500GB lvm volumes on top of a
>> hardware raid.
>> I haven't looked into this at all yet but I wanted to
>> drop it on the list for now. The 273 output is attached.
> 
> I bet you had writes fail with ENOSPC - 201 * 426
> = 85626 files of 8k each, that gives 685MB. When the test is
> running, I see upwards of 1.5GB of space consumed, which then slowly
> drops again as data files are closed and data is written.
> 
> Some of that space is specualtive preallocation (4k per file, I
> think), but also a significant amount of it is metadata reservation
> for delayed allocation (4 blocks per file, IIRC). If I've only got
> 2GB RAM on my machine, then writeback starts at 200MB written, and
> so well before the fs runs out of space the metadata reservations
> are being released.
> 
> I just upped the VM to 8GB RAM, and immediately I see the test
> starting to fail. And this is in 273.full:
> 
> cp: cannot create regular file `/mnt/scratch/sub_198/origin/file_141': No space left on device
> cp: cannot create regular file `/mnt/scratch/sub_198/origin/file_142': No space left on device
> cp: cannot create regular file `/mnt/scratch/sub_198/origin/file_1cp: cannot create regular filcp: cannot create regular file `/mnt/scratch/sub_198/origin/file_147': No space left on device
> cp: cannot create regular file `/mnt/scratch/sub_198/origin/file_1cp: cannot create regular filcp: writing `/mnt/scratch/sub_198/origin/file_149': No space left cp: writing `/mnt/scratch/sub_156/origin/file_275': No space left on device
> cp: failed to extencp: writing `/mnt/scratch/sub_198/origin/file_150': No space left cp: writing `/mnt/scratch/sub_156/origin/file_276': No space left on device
> cp: failed to extencp: cannot create regular file `/mnt/scratch/sub_124/origin/file_3cp: cannot create regular filcp: writing `/mnt/scratch/sub_124/origin/file_378': No space left cp: writing `/mnt/scratch/sub_173/origin/file_250': No space left on device
> cp: failed to extencp: writing `/mnt/scratch/sub_124/origin/file_379': No space left cp: cannot create regular file `/mnt/scratch/sub_173/origin/file_2cp: cannot create regular file `/mnt/scratch/sub_134/origin/file_337': No space left on device
> cp: cannot create regular filcp: cannot create regular filcp: writing `/mnt/scratch/sub_159/origin/file_307': No space left on device
> cp: failed to extend `/mnt/scratch/sub_159/origin/file_307': No space left on device
> cp: writing `/mnt/scratch/sub_159/origin/file_308': No space left on device
> cp: failed to extend `/mnt/scratch/sub_159/origin/file_308': No space left on device
> cp: cannot create regular file `/mnt/scratch/sub_159/origin/file_309': No space left on device
> cp: cannot create regular file `/mnt/scratch/sub_159/origin/file_310': No space left on device
> cp: cannot create regular file `/mnt/scratch/sub_159/origin/file_311': No space left on device
> cp: cannot create regular file `/mnt/scratch/sub_159/origin/file_312': No space left on device
> cp: cannot create regular file `/mnt/scratch/sub_159/origin/file_313': No space left on device
> cp: cannot create regular file `/mnt/scratch/sub_159/origin/file_314': No space left on device
> .....
> 

Yep, I see the same thing...

> So, turning off speculative preallocation via the allocsize mount
> option doesn't fix the problem. IOWs, the problem is too much active
> metadata reservation.  If we are caching 685MB, that's less than the
> writeback thresholds of a large memory machine, so the metadata
> reservations won't be trimmed at all until ENOSPC actually occurs
> and writeback is then started.
> 
> The problem is that writeback_inodes_sb_if_idle() does not block if
> there is already writeback in progress, so the callers just keep
> hitting ENOSPC rather than being throttled waiting for delalloc
> conversion.
> 
> The patch below should fix this - it changes xfs_flush_inodes() to
> us sync_inodes_sb(), which will issue IO and block waiting for it to
> complete, just like xfs_flush_inodes() used to. Indeed, it passes
> again on my VM with 8GB RAM....
> 

I gave this a quick couple runs against 273 and it passes (on top of the
entire die-xfssyncd-die patchset). I'll kick off another full run on
this box overnight. Thanks!

Brian

> Cheers,
> 
> Dave.
> 

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 06/13] xfs: xfs_sync_data is redundant.
  2012-10-02  0:44       ` Brian Foster
@ 2012-10-02 13:01         ` Brian Foster
  2012-10-02 20:51           ` Dave Chinner
  0 siblings, 1 reply; 41+ messages in thread
From: Brian Foster @ 2012-10-02 13:01 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

On 10/01/2012 08:44 PM, Brian Foster wrote:
> On 10/01/2012 08:10 PM, Dave Chinner wrote:
...
> 
> I gave this a quick couple runs against 273 and it passes (on top of
> the entire die-xfssyncd-die patchset). I'll kick off another full run
> on this box overnight. Thanks!
> 

And I spoke a bit too soon... I hit the following warning with this change:

WARNING: at fs/fs-writeback.c:1401 sync_inodes_sb+0xc0/0xd0()

The inline patch addresses it. I also see the following message during
273 but it doesn't appear related to this set:

kernel: XFS (dm-4): xlog_verify_grant_tail: space > BBTOB(tail_blocks)

Brian

diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index da69c18..f11133b 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -294,7 +294,9 @@ xfs_new_eof(struct xfs_inode *ip, xfs_fsize_t new_size)
 static inline void
 xfs_flush_inodes(struct xfs_inode *ip)
 {
-	writeback_inodes_sb_if_idle(VFS_I(ip)->i_sb, WB_REASON_FS_FREE_SPACE);
+	down_read(&VFS_I(ip)->i_sb->s_umount);
+	sync_inodes_sb(VFS_I(ip)->i_sb);
+	up_read(&VFS_I(ip)->i_sb->s_umount);
 }

 /*

> Brian
> 
>> Cheers,
>> 
>> Dave.
>> 
> 
> _______________________________________________ xfs mailing list 
> xfs@oss.sgi.com http://oss.sgi.com/mailman/listinfo/xfs
> 

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 41+ messages in thread

* Re: [PATCH 06/13] xfs: xfs_sync_data is redundant.
  2012-10-02  0:10     ` Dave Chinner
  2012-10-02  0:44       ` Brian Foster
@ 2012-10-02 13:22       ` Christoph Hellwig
  2012-10-02 20:24         ` Dave Chinner
  1 sibling, 1 reply; 41+ messages in thread
From: Christoph Hellwig @ 2012-10-02 13:22 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Brian Foster, xfs

On Tue, Oct 02, 2012 at 10:10:22AM +1000, Dave Chinner wrote:
> diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> index da69c18..0ec7a46 100644
> --- a/fs/xfs/xfs_inode.h
> +++ b/fs/xfs/xfs_inode.h
> @@ -294,7 +294,7 @@ xfs_new_eof(struct xfs_inode *ip, xfs_fsize_t new_size)
>  static inline void
>  xfs_flush_inodes(struct xfs_inode *ip)
>  {
> -	writeback_inodes_sb_if_idle(VFS_I(ip)->i_sb, WB_REASON_FS_FREE_SPACE);
> +	sync_inodes_sb(VFS_I(ip)->i_sb);

sync_inodes_sb needs s_umount held and asserts that, while our callers
usually won't have it.

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 06/13] xfs: xfs_sync_data is redundant.
  2012-10-02  0:14       ` Dave Chinner
@ 2012-10-02 19:39         ` Ben Myers
  0 siblings, 0 replies; 41+ messages in thread
From: Ben Myers @ 2012-10-02 19:39 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Christoph Hellwig, xfs

Hey Dave,

On Tue, Oct 02, 2012 at 10:14:03AM +1000, Dave Chinner wrote:
> On Mon, Oct 01, 2012 at 12:44:10PM -0500, Ben Myers wrote:
> > Hi Dave,
> > 
> > On Fri, Sep 28, 2012 at 08:34:43AM -0400, Christoph Hellwig wrote:
> > > >  /*
> > > > + * Flush all dirty data to disk. Must not be called while holding an XFS_ILOCK
> > > 
> > > slightly too long line.
> > 
> > It comes out at 78 columns when the patch is applied.  Not a problem.
> > 
> > > > + * or a page lock.
> > > > + */
> > > > +static inline void
> > > > +xfs_flush_inodes(struct xfs_inode *ip)
> > > > +{
> > > > +	writeback_inodes_sb_if_idle(VFS_I(ip)->i_sb, WB_REASON_FS_FREE_SPACE);
> > > > +}
> > > 
> > > Passing an inode here seems very odd, an xfs_mount would be the proper
> > > interface. 
> > > 
> > > Looks fine modulo these cosmetic changes, so:
> > >
> > > Reviewed-by: Christoph Hellwig <hch@lst.de>
> > 
> > I cleaned up these cosmetic changes on your behalf.  Here is the result:
> 
> Looks good. Thanks for doing this, Ben.

No problem.  I wanted to pull this in before the merge window opened.
Otherwise I'd have let you go after it.

> FWIW, The patch I just posted to change
> writeback_inodes_sb_if_idle() to sync_inodes_sb() will need
> massaging to apply to your tree now. I'll update it if you push your
> tree out to oss.sgi.com before taking it...

Sure.  You can find it here:

git://oss.sgi.com/xfs/xfs.git, tmp-for-dave-worker-changes branch

Can you repost the full patch rather than a delta?  That would be cleaner.

Regards,
	Ben

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 06/13] xfs: xfs_sync_data is redundant.
  2012-10-02 13:22       ` Christoph Hellwig
@ 2012-10-02 20:24         ` Dave Chinner
  0 siblings, 0 replies; 41+ messages in thread
From: Dave Chinner @ 2012-10-02 20:24 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Brian Foster, xfs

On Tue, Oct 02, 2012 at 09:22:05AM -0400, Christoph Hellwig wrote:
> On Tue, Oct 02, 2012 at 10:10:22AM +1000, Dave Chinner wrote:
> > diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> > index da69c18..0ec7a46 100644
> > --- a/fs/xfs/xfs_inode.h
> > +++ b/fs/xfs/xfs_inode.h
> > @@ -294,7 +294,7 @@ xfs_new_eof(struct xfs_inode *ip, xfs_fsize_t new_size)
> >  static inline void
> >  xfs_flush_inodes(struct xfs_inode *ip)
> >  {
> > -	writeback_inodes_sb_if_idle(VFS_I(ip)->i_sb, WB_REASON_FS_FREE_SPACE);
> > +	sync_inodes_sb(VFS_I(ip)->i_sb);
> 
> sync_inodes_sb needs s_umount held and asserts that, while our callers
> usually won't have it.

Ah, bugger. I didn't notice that, and hadn't checked dmesg after I
ran 273 a few times with this patch. I just figured that because it
waited for completion, it didn't need it.

Adding s_umount is fine for the wirte path, but the create path
holds a directory i_mutex so I'd guess that means it has to be a
trylock to avoid lock inversion warnings from lockdep....

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 06/13] xfs: xfs_sync_data is redundant.
  2012-10-01 20:14   ` Brian Foster
  2012-10-01 21:31     ` Mark Tinguely
  2012-10-02  0:10     ` Dave Chinner
@ 2012-10-02 20:25     ` Ben Myers
  2 siblings, 0 replies; 41+ messages in thread
From: Ben Myers @ 2012-10-02 20:25 UTC (permalink / raw)
  To: Brian Foster; +Cc: xfs

On Mon, Oct 01, 2012 at 04:14:40PM -0400, Brian Foster wrote:
> On 09/28/2012 12:44 AM, Dave Chinner wrote:
> > From: Dave Chinner <dchinner@redhat.com>
> > 
> > We don't do any data writeback from XFS any more - the VFS is
> > completely responsible for that, including for freeze. We can
> > replace the remaining caller with the VFS level function that
> > achieves the same thing, but without conflicting with current
> > writeback work - writeback_inodes_sb_if_idle().
> > 
> > This means we can remove the flush_work and xfs_flush_inodes() - the
> > VFS functionality completely replaces the internal flush queue for
> > doing this writeback work in a separate context to avoid stack
> > overruns.
> > 
> > This does have one complication - it cannot be called with page
> > locks held.  Hence move the flushing of delalloc space when ENOSPC
> > occurs back up into xfs_file_aio_buffered_write when we don't hold
> > any locks that will stall writeback.
> > 
> > Note that we always need to pass a count of zero to
> > generic_file_buffered_write() as the previously written byte count.
> > We only do this by accident before this patch by the virtue of ret
> > always being zero when there are no errors. Make this explicit
> > rather than needing to specifically zero ret in the ENOSPC retry
> > case.
> > 
> > Signed-off-by: Dave Chinner <dchinner@redhat.com>
> 
> Heads up... I was doing some testing against my eofblocks set rebased
> against this patchset and I'm reproducing a new 273 failure. The failure
> bisects down to this patch.

Nice catch Brian!

Regards,
	Ben

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 06/13] xfs: xfs_sync_data is redundant.
  2012-10-02 13:01         ` Brian Foster
@ 2012-10-02 20:51           ` Dave Chinner
  2012-10-02 21:13             ` Brian Foster
  2012-10-04  0:05             ` Ben Myers
  0 siblings, 2 replies; 41+ messages in thread
From: Dave Chinner @ 2012-10-02 20:51 UTC (permalink / raw)
  To: Brian Foster; +Cc: xfs

On Tue, Oct 02, 2012 at 09:01:33AM -0400, Brian Foster wrote:
> On 10/01/2012 08:44 PM, Brian Foster wrote:
> > On 10/01/2012 08:10 PM, Dave Chinner wrote:
> ...
> > 
> > I gave this a quick couple runs against 273 and it passes (on top of
> > the entire die-xfssyncd-die patchset). I'll kick off another full run
> > on this box overnight. Thanks!
> > 
> 
> And I spoke a bit too soon... I hit the following warning with this change:
> 
> WARNING: at fs/fs-writeback.c:1401 sync_inodes_sb+0xc0/0xd0()
> 
> The inline patch addresses it. I also see the following message during
> 273 but it doesn't appear related to this set:
> 
> kernel: XFS (dm-4): xlog_verify_grant_tail: space > BBTOB(tail_blocks)
> 
> Brian
> 
> diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> index da69c18..f11133b 100644
> --- a/fs/xfs/xfs_inode.h
> +++ b/fs/xfs/xfs_inode.h
> @@ -294,7 +294,9 @@ xfs_new_eof(struct xfs_inode *ip, xfs_fsize_t new_size)
>  static inline void
>  xfs_flush_inodes(struct xfs_inode *ip)
>  {
> -	writeback_inodes_sb_if_idle(VFS_I(ip)->i_sb, WB_REASON_FS_FREE_SPACE);
> +	down_read(&VFS_I(ip)->i_sb->s_umount);
> +	sync_inodes_sb(VFS_I(ip)->i_sb);
> +	up_read(&VFS_I(ip)->i_sb->s_umount);
>  }

I don't think we can do an unconditional down_read() there, as the
caller from xfs_create() already holds an i_mutex (the VFS holds the
directory inode lock) and I'm pretty sure that s_umount is supposed
to be outside per-inode locks.

Given that where we are called we are inside a transaction for the
create case, and inside mnt_want_write() protection for the buffered
write case, the likelyhood of s_umount being held for write at
ENOSPC is going to be non-existent at these call sites. Hence a
down_read_trylock() will avoid lock ordering issues, but will almost
always succeed and so be equivalent to down_read()....

/me modifies and runs 273 and the enospc xfstests group...

Seems to work just fine, and no warnings. Patch below.

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

xfs: make inode writeback at ENOSPC blocking.

From: Dave Chinner <dchinner@redhat.com>

writeback_inodes_sb_if_idle() is not sufficient to trigger delalloc
conversion fast enough to prevent spurious ENOSPC whent here are
hundreds of writers, thousands of small files and GBs of free RAM.
Change this to use sync_sb_inodes() to block callers while we wait
for writeback like the previous xfs_flush_inodes implementation did.

We have to hold the s_umount lock here, but because this call can
nest inside i_mutex (the parent directory in the create case, held
by the VFS), we have to use down_read_trylock() to avoid potential
deadlocks. In practice, this trylock will succeed on almost every
attempt as unmount/remount type operations are exceedingly rare.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_inode.h |    7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index da69c18..b3dabe9 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -294,7 +294,12 @@ xfs_new_eof(struct xfs_inode *ip, xfs_fsize_t new_size)
 static inline void
 xfs_flush_inodes(struct xfs_inode *ip)
 {
-	writeback_inodes_sb_if_idle(VFS_I(ip)->i_sb, WB_REASON_FS_FREE_SPACE);
+	struct super_block *sb = VFS_I(ip)->i_sb;
+
+	if (down_read_trylock(&sb->s_umount)) {
+		sync_inodes_sb(sb);
+		up_read(&sb->s_umount);
+	}
 }
 
 /*

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 41+ messages in thread

* Re: [PATCH 06/13] xfs: xfs_sync_data is redundant.
  2012-10-02 20:51           ` Dave Chinner
@ 2012-10-02 21:13             ` Brian Foster
  2012-10-04  0:05             ` Ben Myers
  1 sibling, 0 replies; 41+ messages in thread
From: Brian Foster @ 2012-10-02 21:13 UTC (permalink / raw)
  To: Dave Chinner; +Cc: xfs

On 10/02/2012 04:51 PM, Dave Chinner wrote:
> On Tue, Oct 02, 2012 at 09:01:33AM -0400, Brian Foster wrote:
>> On 10/01/2012 08:44 PM, Brian Foster wrote:
>>> On 10/01/2012 08:10 PM, Dave Chinner wrote:
>> ...
>>>
...
> 
> I don't think we can do an unconditional down_read() there, as the
> caller from xfs_create() already holds an i_mutex (the VFS holds the
> directory inode lock) and I'm pretty sure that s_umount is supposed
> to be outside per-inode locks.
> 
> Given that where we are called we are inside a transaction for the
> create case, and inside mnt_want_write() protection for the buffered
> write case, the likelyhood of s_umount being held for write at
> ENOSPC is going to be non-existent at these call sites. Hence a
> down_read_trylock() will avoid lock ordering issues, but will almost
> always succeed and so be equivalent to down_read()....
> 

Ok...

> /me modifies and runs 273 and the enospc xfstests group...
> 
> Seems to work just fine, and no warnings. Patch below.
> 

This passes 273 clean (save the xlog_verify_grant_tail message) for me
as well. Thanks.

Brian

> Cheers,
> 
> Dave.
> 

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 06/13] xfs: xfs_sync_data is redundant.
  2012-10-02 20:51           ` Dave Chinner
  2012-10-02 21:13             ` Brian Foster
@ 2012-10-04  0:05             ` Ben Myers
  2012-10-04  1:07               ` Dave Chinner
  1 sibling, 1 reply; 41+ messages in thread
From: Ben Myers @ 2012-10-04  0:05 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Brian Foster, xfs

Hey Dave,

On Wed, Oct 03, 2012 at 06:51:24AM +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> writeback_inodes_sb_if_idle() is not sufficient to trigger delalloc
> conversion fast enough to prevent spurious ENOSPC whent here are
> hundreds of writers, thousands of small files and GBs of free RAM.
> Change this to use sync_sb_inodes() to block callers while we wait
> for writeback like the previous xfs_flush_inodes implementation did.
> 
> We have to hold the s_umount lock here, but because this call can
> nest inside i_mutex (the parent directory in the create case, held
> by the VFS), we have to use down_read_trylock() to avoid potential
> deadlocks. In practice, this trylock will succeed on almost every
> attempt as unmount/remount type operations are exceedingly rare.
> 
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
>  fs/xfs/xfs_inode.h |    7 ++++++-
>  1 file changed, 6 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> index da69c18..b3dabe9 100644
> --- a/fs/xfs/xfs_inode.h
> +++ b/fs/xfs/xfs_inode.h
> @@ -294,7 +294,12 @@ xfs_new_eof(struct xfs_inode *ip, xfs_fsize_t new_size)
>  static inline void
>  xfs_flush_inodes(struct xfs_inode *ip)
>  {
> -	writeback_inodes_sb_if_idle(VFS_I(ip)->i_sb, WB_REASON_FS_FREE_SPACE);
> +	struct super_block *sb = VFS_I(ip)->i_sb;
> +
> +	if (down_read_trylock(&sb->s_umount)) {
> +		sync_inodes_sb(sb);
> +		up_read(&sb->s_umount);
> +	}
>  }
>  
>  /*
> 

Is this the one you want to go with?  I'd prefer to apply this to patch 6
itself rather than check in the regression followed by the fix.  If you'd like
resubmit this patch that's great, otherwise I'll be happy to repost the result.

Thanks again Brian.

Regards,
Ben

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 06/13] xfs: xfs_sync_data is redundant.
  2012-10-04  0:05             ` Ben Myers
@ 2012-10-04  1:07               ` Dave Chinner
  0 siblings, 0 replies; 41+ messages in thread
From: Dave Chinner @ 2012-10-04  1:07 UTC (permalink / raw)
  To: Ben Myers; +Cc: Brian Foster, xfs

On Wed, Oct 03, 2012 at 07:05:52PM -0500, Ben Myers wrote:
> Hey Dave,
> 
> On Wed, Oct 03, 2012 at 06:51:24AM +1000, Dave Chinner wrote:
> > From: Dave Chinner <dchinner@redhat.com>
> > 
> > writeback_inodes_sb_if_idle() is not sufficient to trigger delalloc
> > conversion fast enough to prevent spurious ENOSPC whent here are
> > hundreds of writers, thousands of small files and GBs of free RAM.
> > Change this to use sync_sb_inodes() to block callers while we wait
> > for writeback like the previous xfs_flush_inodes implementation did.
> > 
> > We have to hold the s_umount lock here, but because this call can
> > nest inside i_mutex (the parent directory in the create case, held
> > by the VFS), we have to use down_read_trylock() to avoid potential
> > deadlocks. In practice, this trylock will succeed on almost every
> > attempt as unmount/remount type operations are exceedingly rare.
> > 
> > Signed-off-by: Dave Chinner <dchinner@redhat.com>
<snip>

> Is this the one you want to go with?  I'd prefer to apply this to patch 6
> itself rather than check in the regression followed by the fix.  If you'd like
> resubmit this patch that's great, otherwise I'll be happy to repost the result.

Yes, this is the version that is necessary. Folding it back
into the original patch is fine as long as you fold the commit
message as well to explain why it was implemented this way.

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 41+ messages in thread

end of thread, other threads:[~2012-10-04  1:06 UTC | newest]

Thread overview: 41+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2012-09-28  4:44 [PATCH V3 00/13] xfs: remove the xfssyncd mess Dave Chinner
2012-09-28  4:44 ` [PATCH 01/13] xfs: xfs_syncd_stop must die Dave Chinner
2012-09-28  4:44 ` [PATCH 02/13] xfs: rationalise xfs_mount_wq users Dave Chinner
2012-09-28 12:27   ` Christoph Hellwig
2012-09-28  4:44 ` [PATCH 03/13] xfs: don't run the sync work if the filesystem is read-only Dave Chinner
2012-09-28 12:27   ` Christoph Hellwig
2012-09-28  4:44 ` [PATCH 04/13] xfs: sync work is now only periodic log work Dave Chinner
2012-09-28 12:31   ` Christoph Hellwig
2012-09-28  4:44 ` [PATCH 05/13] xfs: Bring some sanity to log unmounting Dave Chinner
2012-09-28  4:44 ` [PATCH 06/13] xfs: xfs_sync_data is redundant Dave Chinner
2012-09-28 12:34   ` Christoph Hellwig
2012-10-01 17:44     ` Ben Myers
2012-10-02  0:14       ` Dave Chinner
2012-10-02 19:39         ` Ben Myers
2012-09-28 21:44   ` Mark Tinguely
2012-10-01 20:14   ` Brian Foster
2012-10-01 21:31     ` Mark Tinguely
2012-10-02  0:10     ` Dave Chinner
2012-10-02  0:44       ` Brian Foster
2012-10-02 13:01         ` Brian Foster
2012-10-02 20:51           ` Dave Chinner
2012-10-02 21:13             ` Brian Foster
2012-10-04  0:05             ` Ben Myers
2012-10-04  1:07               ` Dave Chinner
2012-10-02 13:22       ` Christoph Hellwig
2012-10-02 20:24         ` Dave Chinner
2012-10-02 20:25     ` Ben Myers
2012-09-28  4:44 ` [PATCH 07/13] xfs: syncd workqueue is no more Dave Chinner
2012-09-28 12:35   ` Christoph Hellwig
2012-09-28 18:17   ` Mark Tinguely
2012-10-01 17:54     ` Ben Myers
2012-09-28  4:44 ` [PATCH 08/13] xfs: xfs_sync_fsdata is redundant Dave Chinner
2012-09-28  4:44 ` [PATCH 09/13] xfs: move xfs_quiesce_attr() into xfs_super.c Dave Chinner
2012-09-28  4:44 ` [PATCH 10/13] xfs: xfs_quiesce_attr() should quiesce the log like unmount Dave Chinner
2012-09-28  4:44 ` [PATCH 11/13] xfs: rename xfs_sync.[ch] to xfs_icache.[ch] Dave Chinner
2012-09-28  4:44 ` [PATCH 12/13] xfs: move inode locking functions to xfs_inode.c Dave Chinner
2012-09-28  4:44 ` [PATCH 13/13] xfs: remove xfs_iget.c Dave Chinner
  -- strict thread matches above, loose matches on Subject: below --
2012-08-30 12:00 [PATCH V2 00/13] xfs: remove the xfssyncd mess Dave Chinner
2012-08-30 12:00 ` [PATCH 01/13] xfs: xfs_syncd_stop must die Dave Chinner
2012-09-01 23:15   ` Christoph Hellwig
2012-09-04 16:10   ` Mark Tinguely
2012-08-30 10:57 [PATCH 00/13] xfs: remove the xfssyncd mess Dave Chinner
2012-08-30 10:57 ` [PATCH 01/13] xfs: xfs_syncd_stop must die Dave Chinner

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox